diff --git a/.gitattributes b/.gitattributes index 5a815654b4c..bede44edf8a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -15,4 +15,6 @@ windows/INSTALL* eol=native windows/NewGuidCmd.exe.config text eol=crlf windows/NewGuidCmd.exe binary +# Prevent git changing CR-LF to LF when archiving (patch requires CR-LF on Windows). +**/*.patch -text diff --git a/.gitignore b/.gitignore index 961c8d64dc0..97a9cfff255 100644 --- a/.gitignore +++ b/.gitignore @@ -6,11 +6,12 @@ !/src/*/Makefile !/src/*/README -# Compiled Object files +# Compiled Object files and python ciles *.slo *.lo *.o *.obj +*.pyc # Compiled Dynamic libraries *.so @@ -81,6 +82,8 @@ GSYMS /tools/portaudio/ /tools/sctk-2.4.0-20091110-0958.tar.bz2 /tools/sctk-2.4.0/ +/tools/sctk-2.4.10-20151007-1312Z.tar.bz2 +/tools/sctk-2.4.10/ /tools/sph2pipe_v2.5.tar.gz /tools/sph2pipe_v2.5/ /tools/kaldi_lm.tar.gz @@ -98,5 +101,9 @@ GSYMS /tools/mpg123 /tools/mpg123-1.21.0.tar.bz2 /tools/mpg123-1.21.0 +/tools/pthreads +/tools/pthreads*.zip /tools/sequitur /tools/srilm.tgz + +/kaldiwin_vs* diff --git a/egs/ami/s5/RESULTS_ihm b/egs/ami/s5/RESULTS_ihm index 234a434afb4..6435e9df47b 100644 --- a/egs/ami/s5/RESULTS_ihm +++ b/egs/ami/s5/RESULTS_ihm @@ -6,6 +6,9 @@ for x in exp/ihm/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x = exit 0 +# Results with close-talk microphones (IHM), + +# Pawel, dev exp/ihm/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.dtl:Percent Total Error = 38.0% (35925) exp/ihm/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_14/dev.ctm.filt.dtl:Percent Total Error = 35.3% (33329) @@ -18,14 +21,58 @@ exp/ihm/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.dtl:Perce exp/ihm/tri4a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:Percent Total Error = 35.0% (31463) exp/ihm/tri4a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.dtl:Percent Total Error = 31.7% (28518) +# Karel, JSALT 2015, (21.7.2015) + +# dev, +## GMM, +%WER 38.1 | 13098 94489 | 67.1 20.6 12.2 5.2 38.1 67.0 | exp/ihm/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.sys +%WER 35.5 | 13098 94487 | 69.6 19.0 11.4 5.1 35.5 65.8 | exp/ihm/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.sys +%WER 32.2 | 13098 94483 | 72.5 17.2 10.3 4.8 32.2 63.8 | exp/ihm/tri4a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_14/dev.ctm.filt.sys #0.1% worse than Pawel! +%WER 30.2 | 13098 94479 | 74.0 15.6 10.4 4.2 30.2 61.9 | exp/ihm/tri4a_mmi_b0.1/decode_dev_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/dev.ctm.filt.sys +## DNN-Xent, +%WER 26.0 | 13098 94483 | 77.9 13.5 8.5 4.0 26.0 58.4 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_11/dev.ctm.filt.sys +## DNN-sMBR, +%WER 24.9 | 13098 94484 | 79.2 13.2 7.6 4.1 24.9 57.1 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/dev.ctm.filt.sys +%WER 24.3 | 13098 94481 | 79.6 12.6 7.8 3.9 24.3 56.3 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_12/dev.ctm.filt.sys + +# eval, +## GMM, +%WER 43.9 | 12643 89978 | 60.8 25.3 13.9 4.8 43.9 65.6 | exp/ihm/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys +%WER 40.8 | 12643 89985 | 63.8 23.6 12.6 4.7 40.8 64.6 | exp/ihm/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys +%WER 35.1 | 12643 89975 | 69.1 19.8 11.1 4.2 35.1 61.8 | exp/ihm/tri4a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys #0.1% worse than Pawel! 
+%WER 31.7 | 12643 89986 | 72.1 18.0 9.9 3.8 31.7 59.4 | exp/ihm/tri4a_mmi_b0.1/decode_eval_4.mdl_ami_fsh.o3g.kn.pr1-7/ascore_11/eval.ctm.filt.sys +## DNN-Xent, +%WER 27.1 | 12643 89971 | 76.4 15.5 8.1 3.5 27.1 57.2 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_10/eval.ctm.filt.sys +## DNN-sMBR, +%WER 25.4 | 12643 89974 | 77.9 14.7 7.4 3.3 25.4 55.1 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/eval.ctm.filt.sys +%WER 24.6 | 12643 89972 | 78.8 14.1 7.1 3.3 24.6 54.4 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_11/eval.ctm.filt.sys + -# TDNN results +# Vijay, TDNN results, for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/ascore_*/*.sys | utils/best_wer.sh; done -#dev +# dev, %WER 25.0 | 13098 94483 | 78.3 12.0 9.6 3.4 25.0 57.7 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev/ascore_13/dev_hires.ctm.filt.sys %WER 25.3 | 13098 94468 | 78.5 12.7 8.8 3.8 25.3 57.9 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev_utt/ascore_12/dev_hires.ctm.filt.sys %WER 25.0 | 13098 94476 | 78.5 12.4 9.1 3.6 25.0 58.0 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev_utt_offline/ascore_13/dev_hires.ctm.filt.sys -#eval +# eval, %WER 25.9 | 12643 89971 | 77.2 14.2 8.6 3.2 25.9 56.4 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval/ascore_12/eval_hires.ctm.filt.sys %WER 26.0 | 12643 89976 | 77.1 14.7 8.2 3.2 26.0 55.7 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval_utt/ascore_12/eval_hires.ctm.filt.sys %WER 25.8 | 12643 89978 | 77.6 14.6 7.8 3.4 25.8 55.8 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval_utt_offline/ascore_11/eval_hires.ctm.filt.sys + +#------------------------------------------------------------------------------------------------------------------------------------ +# Nnet3 systems + +# BLSTM +# local/nnet3/run_blstm.sh --mic ihm \ +# --chunk-right-context 20 \ +# --use-sat-alignments true +# Note: Chunk right context of 20 limits the latency of the acoustic model to +# 20 frames. 
+ +%WER 22.8 | 13098 94494 | 80.1 11.0 8.9 3.0 22.8 54.8 | exp/ihm/nnet3/lstm_bidirectional_ld0/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.6 | 12643 89969 | 80.0 12.7 7.3 2.7 22.6 53.5 | exp/ihm/nnet3/lstm_bidirectional_ld0/decode_eval/ascore_9/eval_hires.ctm.filt.sys + +## Chain systems + # local/chain/run_tdnn_ami_5.sh --mic ihm --max-wer 50 --affix min_seg_len2_50wer (built with min-seg-len 2 secs, but script now just supports (frames_per_eg+5)/100) + %WER 22.4 | 13098 94484 | 80.5 10.7 8.8 3.0 22.4 54.8 | 0.091 | exp/ihm/chain/tdnn_min_seg_len2_50wer_sp/decode_dev/ascore_10/dev_hires.ctm.filt.sys + %WER 22.4 | 12643 89973 | 80.3 12.6 7.1 2.8 22.4 53.2 | 0.155 | exp/ihm/chain/tdnn_min_seg_len2_50wer_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5/RESULTS_mdm b/egs/ami/s5/RESULTS_mdm index d0cbb335bd8..757f6a4d227 100644 --- a/egs/ami/s5/RESULTS_mdm +++ b/egs/ami/s5/RESULTS_mdm @@ -6,8 +6,9 @@ for x in exp/mdm*/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x exit 0 -#Beamforming of 8 microphones, WER scores with up to 4 overlapping speakers +# Beamforming of 8 microphones, WER scores with up to 4 overlapping speakers, +# Pawel, dev exp/mdm8/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 58.8% (55568) exp/mdm8/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 57.0% (53855) @@ -35,7 +36,7 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr #dev %WER 40.9 | 15965 94490 | 64.6 19.9 15.5 5.5 40.9 61.9 | -26.104 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_dev_utt/ascore_12/dev_hires_o4.ctm.filt.sys %WER 40.7 | 13961 94495 | 64.4 18.8 16.8 5.0 40.7 70.4 | -26.622 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_dev_utt_offline/ascore_13/dev_hires_o4.ctm.filt.sys - #eval + #eval %WER 44.2 | 13577 89767 | 61.1 22.3 16.6 5.3 44.2 68.9 | -25.003 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt/ascore_11/eval_hires_o4.ctm.filt.sys %WER 44.0 | 13448 89769 | 60.8 21.4 17.8 4.9 44.0 69.6 | -25.331 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt_offline/ascore_12/eval_hires_o4.ctm.filt.sys @@ -57,8 +58,8 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr %WER 40.9 | 15965 94490 | 64.6 19.9 15.5 5.5 40.9 61.9 | -26.104 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_dev_utt/ascore_12/dev_hires_o4.ctm.filt.sys %WER 40.7 | 13961 94495 | 64.4 18.8 16.8 5.0 40.7 70.4 | -26.622 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_dev_utt_offline/ascore_13/dev_hires_o4.ctm.filt.sys - #eval - # epoch 0 + #eval + # epoch 0 %WER 45.4 | 13992 89799 | 60.1 26.4 13.5 5.5 45.4 67.3 | -23.969 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch0_eval_utt/ascore_12/eval_hires_o4.ctm.filt.sys %WER 45.1 | 13893 89804 | 60.3 25.9 13.9 5.4 45.1 67.9 | -24.110 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch0_eval_utt_offline/ascore_12/eval_hires_o4.ctm.filt.sys # epoch 1 @@ -74,3 +75,56 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr %WER 44.2 | 13577 89767 | 61.1 22.3 16.6 5.3 44.2 68.9 | -25.003 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt/ascore_11/eval_hires_o4.ctm.filt.sys %WER 44.0 | 13448 89769 | 60.8 21.4 17.8 4.9 44.0 69.6 | -25.331 | 
exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt_offline/ascore_12/eval_hires_o4.ctm.filt.sys + +#------------------------------------------------------------------------------------------------------------------------------------ +# Nnet3 systems + +# BLSTM + clean alignments +# local/nnet3/run_blstm.sh --mic mdm8\ +# --chunk-right-context 20 \ +# --use-sat-alignments true \ +# --use-ihm-ali true +# Note: Chunk right context of 20 limits the latency of the acoustic model to +# 20 frames. + + + %WER 35.5 | 15221 94509 | 69.9 21.0 9.1 5.4 35.5 61.4 | -26.440 | exp/mdm8_cleanali/nnet3/lstm_bidirectional_ld0/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys + %WER 38.3 | 13423 89786 | 65.8 22.0 12.2 4.1 38.3 66.3 | -26.016 | exp/mdm8_cleanali/nnet3/lstm_bidirectional_ld0/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + + +################################# Chain Systems ###################### + + # local/chain/run_tdnn_ami_5.sh --mic mdm8 --affix msl1.5_45wer + %WER 38.5 | 14761 94496 | 65.5 17.6 16.9 4.0 38.5 66.5 | 0.620 | exp/mdm8/chain/tdnn_ami5_msl1.5_45wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys + %WER 41.5 | 14219 89974 | 62.2 18.5 19.2 3.7 41.5 65.6 | 0.596 | exp/mdm8/chain/tdnn_ami5_msl1.5_45wer_sp/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + + # local/chain/run_tdnn_ami_5.sh --mic mdm8 --use-ihm-ali true --affix msl1.5_45wer + %WER 38.1 | 15296 94487 | 65.7 17.9 16.4 3.8 38.1 62.5 | 0.617 | exp/mdm8_cleanali/chain/tdnn_ami5_msl1.5_45wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys + %WER 41.5 | 13795 89975 | 62.6 20.4 17.0 4.1 41.5 66.9 | 0.628 | exp/mdm8_cleanali/chain/tdnn_ami5_msl1.5_45wer_sp/decode_eval/ascore_8/eval_hires_o4.ctm.filt.sys + +#-------------------------------------------------------------------------------------------------------------------------------------------- +# Karel, JSALT 2015, (31.7.2015) +# nnet1, MFCC-LDA-MLLT-DNN system (local/nnet/run_dnn_lda_mllt.sh), + +# dev, +## GMM, +%WER 59.1 | 14105 94500 | 47.5 34.3 18.2 6.6 59.1 76.0 | -22.348 | exp/mdm8/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.sys +%WER 57.2 | 14807 94503 | 49.6 33.2 17.3 6.8 57.2 72.1 | -22.450 | exp/mdm8/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.sys +%WER 55.0 | 14511 94490 | 51.1 30.0 18.8 6.2 55.0 73.0 | -22.760 | exp/mdm8/tri3a_mmi_b0.1/decode_dev_2.mdl_ami_fsh.o3g.kn.pr1-7/ascore_11/dev_o4.ctm.filt.sys +## DNN-Xent, +%WER 48.2 | 15246 94513 | 58.4 28.7 12.9 6.7 48.2 67.3 | -23.329 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.sys +## DNN-sMBR, +%WER 46.2 | 15260 94500 | 60.1 26.0 13.9 6.3 46.2 66.3 | -23.908 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it1/ascore_13/dev_o4.ctm.filt.sys +%WER 45.1 | 14204 94504 | 61.1 24.9 14.0 6.2 45.1 70.7 | -24.225 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_13/dev_o4.ctm.filt.sys + +# eval, +## GMM, +%WER 64.4 | 14362 90002 | 41.7 36.6 21.8 6.0 64.4 71.2 | -22.256 | exp/mdm8/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +%WER 62.1 | 13700 89987 | 44.0 35.5 20.5 6.2 62.1 74.1 | -22.267 | exp/mdm8/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +%WER 59.5 | 13822 89978 | 46.3 32.8 20.9 5.7 59.5 72.6 | -22.394 | exp/mdm8/tri3a_mmi_b0.1/decode_eval_2.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/eval_o4.ctm.filt.sys +## DNN-Xent, +%WER 52.1 | 13642 89829 | 53.6 30.7 15.7 5.7 52.1 71.6 | -22.884 | 
exp/mdm8/dnn4noSAT_pretrain-dbn_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +## DNN-sMBR, +%WER 50.3 | 14264 89966 | 54.7 27.6 17.8 5.0 50.3 67.5 | -23.397 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it1/ascore_13/eval_o4.ctm.filt.sys +%WER 49.1 | 13969 89982 | 55.8 26.7 17.4 4.9 49.1 68.4 | -23.629 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_13/eval_o4.ctm.filt.sys + diff --git a/egs/ami/s5/RESULTS_sdm b/egs/ami/s5/RESULTS_sdm index d2bcad1f414..362d4019327 100644 --- a/egs/ami/s5/RESULTS_sdm +++ b/egs/ami/s5/RESULTS_sdm @@ -5,8 +5,9 @@ for x in exp/sdm*/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x for x in exp/sdm*/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/ascore_*/*.sys | utils/best_wer.sh; done 2>/dev/null exit 0 -#the below are WER scores with up to 4 overlapping speakers +# The below are WER scores with up to 4 overlapping speakers, +# Pawel, dev exp/sdm1/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 66.9% (63190) exp/sdm1/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 64.5% (60963) @@ -17,8 +18,6 @@ exp/sdm1/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:P exp/sdm1/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.dtl:Percent Total Error = 69.5% (62576) exp/sdm1/tri3a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/eval_o4.ctm.filt.dtl:Percent Total Error = 67.2% (60447) - - #-------------------------------------------------------------------------------------------------------------------------------------------- #TDNN-online system mic=sdm1 @@ -27,7 +26,7 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr #Dev %WER 46.8 | 15053 94502 | 59.3 27.6 13.0 6.2 46.8 67.0 | -23.602 | exp/sdm1/nnet2_online/nnet_ms_sp_online/decode_dev_utt/ascore_12/dev_hires_o4.ctm.filt.sys %WER 46.4 | 14210 94496 | 59.0 26.6 14.4 5.4 46.4 70.7 | -23.844 | exp/sdm1/nnet2_online/nnet_ms_sp_online/decode_dev_utt_offline/ascore_13/dev_hires_o4.ctm.filt.sys - + #Eval %WER 50.7 | 13180 89643 | 54.7 29.6 15.7 5.3 50.7 72.6 | -23.104 | exp/sdm1/nnet2_online/nnet_ms_sp_online/decode_eval_utt/ascore_12/eval_hires_o4.ctm.filt.sys %WER 50.5 | 13099 89806 | 54.7 29.3 15.9 5.2 50.5 73.5 | -23.149 | exp/sdm1/nnet2_online/nnet_ms_sp_online/decode_eval_utt_offline/ascore_12/eval_hires_o4.ctm.filt.sys @@ -66,3 +65,83 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr #epoch 4 %WER 49.1 | 13948 89977 | 55.6 25.2 19.2 4.8 49.1 68.2 | -23.902 | exp/sdm1/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt/ascore_15/eval_hires_o4.ctm.filt.sys %WER 49.0 | 14259 89798 | 55.8 25.4 18.8 4.8 49.0 66.6 | -23.873 | exp/sdm1/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt_offline/ascore_15/eval_hires_o4.ctm.filt.sys + + +#------------------------------------------------------------------------------------------------------------------------------------ +# Nnet3 systems +# the ivectors are estimated per recording (not per utterance), the results will be updated with per-utterance ivectors + +#TDNN +#Total training time is 5:19:19 +# local/nnet3/run_tdnn.sh --mic sdm1 --use-sat-alignments false +%WER 46.1 | 15377 94333 | 59.1 25.8 15.0 5.2 46.1 65.6 | -24.026 | exp/sdm1/nnet3/tdnn_sp/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys +%WER 50.9 | 13867 89975 | 
53.6 27.9 18.5 4.5 50.9 70.1 | -23.332 | exp/sdm1/nnet3/tdnn_sp/decode_eval/ascore_11/eval_hires_o4.ctm.filt.sys + +#LSTM +#Total training time is 21:34:06 +%WER 44.2 | 14069 94507 | 61.3 25.8 12.9 5.5 44.2 70.7 | -24.180 | exp/sdm1/nnet3/lstm_sp_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 47.6 | 14034 89978 | 56.8 26.9 16.4 4.3 47.6 67.7 | -23.786 | exp/sdm1/nnet3/lstm_sp_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + +#Variable-delay LSTM (Default LSTM recipe) +#Total training time is 18:43:35 +# local/nnet3/run_lstm.sh --mic sdm1 --use-sat-alignments false +%WER 44.4 | 14208 94318 | 61.2 25.7 13.1 5.5 44.4 70.1 | -24.197 | exp/sdm1/nnet3/lstm_sp_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 47.9 | 14766 89956 | 56.0 25.7 18.2 3.9 47.9 64.1 | -23.997 | exp/sdm1/nnet3/lstm_sp_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + +# BLSTM +# local/nnet3/run_blstm.sh --mic sdm1 --use-sat-alignments false +%WER 42.8 | 14948 94501 | 62.2 25.2 12.6 5.1 42.8 65.8 | -24.499 | exp/sdm1/nnet3/lstm_sp_bidirectional_ld0/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 46.1 | 13760 89981 | 57.8 25.9 16.3 3.9 46.1 68.2 | -24.143 | exp/sdm1/nnet3/lstm_sp_bidirectional_ld0/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + +# local/nnet3/run_blstm.sh --mic sdm1 --use-sat-alignments true +%WER 42.5 | 14150 94510 | 62.4 24.6 12.9 4.9 42.5 69.2 | -24.676 | exp/sdm1/nnet3/lstm_sp_bidirectional_fmllr_ld0/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 45.6 | 14142 89993 | 58.5 26.1 15.4 4.2 45.6 66.5 | -24.127 | exp/sdm1/nnet3/lstm_sp_bidirectional_fmllr_ld0/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +# BLSTM + clean alignments +# local/nnet3/run_blstm.sh --mic sdm1 \ +# --chunk-right-context 20 \ +# --use-sat-alignments true \ +# --use-ihm-ali true +# Note: Chunk right context of 20 limits the latency of the acoustic model to +# 20 frames. 
+ +%WER 38.5 | 14828 94514 | 66.6 22.7 10.6 5.2 38.5 63.7 | -25.569 | exp/sdm1_cleanali/nnet3/lstm_sp_bidirectional_ld0/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys +%WER 41.8 | 12828 89977 | 62.5 24.6 12.9 4.3 41.8 70.8 | -24.813 | exp/sdm1_cleanali/nnet3/lstm_sp_bidirectional_ld0/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + +################################## +# chain model results + # local/chain/run_tdnn_ami_5.sh --mic sdm1 --affix msl1.5_45wer + %WER 42.8 | 14391 94487 | 60.8 19.3 19.9 3.6 42.8 69.1 | 0.588 | exp/sdm1/chain/tdnn_ami4_msl1.5_45wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys + %WER 46.1 | 13754 89977 | 57.5 20.7 21.9 3.6 46.1 69.2 | 0.561 | exp/sdm1/chain/tdnn_ami4_msl1.5_45wer_sp/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + + + # local/chain/run_tdnn_ami_5.sh --mic sdm1 --use-ihm-ali true --max-wer 50 --affix msl1.5_50wer + %WER 41.6 | 14793 94504 | 61.8 19.3 18.9 3.4 41.6 65.3 | 0.591 | exp/sdm1_cleanali/chain/tdnn_ami4_msl1.5_50wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys + %WER 45.4 | 14141 89972 | 57.9 20.7 21.4 3.3 45.4 64.8 | 0.567 | exp/sdm1_cleanali/chain/tdnn_ami4_msl1.5_50wer_sp/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +#-------------------------------------------------------------------------------------------------------------------------------------------- +# Karel, JSALT 2015, (28.7.2015) +# nnet1, MFCC-LDA-MLLT-DNN system (local/nnet/run_dnn_lda_mllt.sh), + +# dev, +## GMM +%WER 66.8 | 14238 94527 | 40.1 40.4 19.5 6.8 66.8 76.1 | -22.367 | exp/sdm1/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_12/dev_o4.ctm.filt.sys +%WER 64.4 | 14843 94511 | 42.1 38.7 19.2 6.5 64.4 72.2 | -22.275 | exp/sdm1/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_12/dev_o4.ctm.filt.sys +%WER 62.3 | 14761 94499 | 44.0 35.7 20.3 6.3 62.3 72.4 | -22.262 | exp/sdm1/tri3a_mmi_b0.1/decode_dev_2.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/dev_o4.ctm.filt.sys +## DNN-Xent, +%WER 54.0 | 14017 94513 | 51.7 32.3 15.9 5.7 54.0 73.8 | -22.649 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.sys +## DNN-sMBR, +%WER 51.6 | 15097 94506 | 54.5 29.8 15.7 6.1 51.6 67.5 | -22.989 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/dev_o4.ctm.filt.sys +%WER 50.6 | 14806 94481 | 55.4 29.6 15.0 6.0 50.6 68.7 | -23.087 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_14/dev_o4.ctm.filt.sys + +# eval, +## GMM, +%WER 71.8 | 13901 89999 | 33.9 41.6 24.5 5.7 71.8 74.4 | -22.720 | exp/sdm1/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +%WER 69.5 | 13480 89988 | 36.0 39.6 24.4 5.5 69.5 76.3 | -22.469 | exp/sdm1/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +%WER 67.2 | 13704 89979 | 38.1 36.6 25.3 5.3 67.2 73.7 | -22.292 | exp/sdm1/tri3a_mmi_b0.1/decode_eval_2.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/eval_o4.ctm.filt.sys +## DNN-Xent, +%WER 58.6 | 14191 89646 | 46.7 34.8 18.6 5.3 58.6 69.2 | -22.351 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +## DNN-sMBR, +%WER 56.4 | 14203 89973 | 48.8 31.7 19.5 5.2 56.4 68.8 | -22.584 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/eval_o4.ctm.filt.sys +%WER 55.0 | 13731 89834 | 50.7 32.6 16.6 5.8 55.0 70.7 | -22.580 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_13/eval_o4.ctm.filt.sys + diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh index 
e9899d582f6..5ec5d4b715f 100644 --- a/egs/ami/s5/cmd.sh +++ b/egs/ami/s5/cmd.sh @@ -1,9 +1,24 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 1G" +export decode_cmd="queue.pl --mem 2G" +# the use of cuda_cmd is deprecated but it is sometimes still used in nnet1 +# scripts. +export cuda_cmd="queue.pl --gpu 1 --mem 20G" + +# the rest of this file is present for historical reasons. +# In general it's best to rely on conf/queue.conf for cluster-specific +# configuration. # On Eddie use: #export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00" @@ -11,29 +26,13 @@ #export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4" #export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00" -# JSALT2015 workshop, cluster AWS-EC2, (setup from Vijay) -#export train_cmd="queue.pl -l arch=*64*" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -export highmem_cmd="queue.pl -l arch=*64* --mem 4G" -export scoring_cmd="queue.pl -l arch=*64*" -export cuda_cmd="queue.pl --gpu 1 -l mem_free=20G,ram_free=20G" -#export cuda_cmd="run.pl" -export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2" - -# To run locally, use: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export highmem_cmd=run.pl -#export cuda_cmd=run.pl - if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -fi +fi diff --git a/egs/ami/s5/conf/queue_jsalt.conf b/egs/ami/s5/conf/queue_jsalt.conf new file mode 100644 index 00000000000..6cda84f912a --- /dev/null +++ b/egs/ami/s5/conf/queue_jsalt.conf @@ -0,0 +1,11 @@ +# Origin at : http://wiki.clsp.jhu.edu/view/Ws15_AWS_Kluster_Rules +# configuration for the AWS cluster for WS'15. 
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 +option gpu=1 -q gpu.q diff --git a/egs/ami/s5/local/ami_beamform.sh b/egs/ami/s5/local/ami_beamform.sh index 419e67c74d2..b5ff8c23ba8 100755 --- a/egs/ami/s5/local/ami_beamform.sh +++ b/egs/ami/s5/local/ami_beamform.sh @@ -34,6 +34,8 @@ set -u mkdir -p $odir mkdir -p $wdir/log +[ -e $odir/.done_beamforming ] && echo "Beamforming already done, skipping..." && exit 0 + meetings=$wdir/meetings.list cat local/split_train.orig local/split_dev.orig local/split_eval.orig | sort > $meetings @@ -74,3 +76,4 @@ echo -e "Beamforming\n" $cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ local/beamformit.sh $nj JOB $numch $meetings $sdir $odir +touch $odir/.done_beamforming diff --git a/egs/ami/s5/local/ami_download.sh b/egs/ami/s5/local/ami_download.sh index 3a2a0c5c0fe..b14f8550c75 100755 --- a/egs/ami/s5/local/ami_download.sh +++ b/egs/ami/s5/local/ami_download.sh @@ -53,8 +53,8 @@ cat local/split_train.orig local/split_eval.orig local/split_dev.orig > $wdir/am wgetfile=$wdir/wget_$mic.sh # TODO fix this with Pawel, files don't exist anymore, -manifest="wget -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt" -license="wget -O $adir/LICENCE.TXT http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt" +manifest="wget --continue -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt" +license="wget --continue -O $adir/LICENCE.TXT http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt" echo "#!/bin/bash" > $wgetfile echo $manifest >> $wgetfile diff --git a/egs/ami/s5/local/ami_format_data.sh b/egs/ami/s5/local/ami_format_data.sh index fda2d498137..91cd619f574 100755 --- a/egs/ami/s5/local/ami_format_data.sh +++ b/egs/ami/s5/local/ami_format_data.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -15,25 +15,12 @@ arpa_lm=$1 cp -r data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. 
@@ -61,4 +48,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo LG is not stochastic echo AMI_format_data succeeded. - diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index a0cca9c5f8e..c3b9914d7a0 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -84,7 +84,7 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; #check and correct the case when segment timings for given speaker overlap themself #(important for simulatenous asclite scoring to proceed). #There is actually only one such case for devset and automatic segmentetions -join $dir/utt2spkm $dir/segments | \ +join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh index 406add86bca..ab0fd185f70 100755 --- a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -1,15 +1,15 @@ #!/bin/bash # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) -# AMI Corpus dev/eval data preparation +# AMI Corpus dev/eval data preparation . path.sh #check existing directories if [ $# != 3 ]; then echo "Usage: ami_mdm_scoring_data_prep.sh /path/to/AMI-MDM mic-name set-name" - exit 1; -fi + exit 1; +fi AMI_DIR=$1 mic=$2 @@ -24,8 +24,8 @@ mkdir -p $tmpdir # Audio data directory check if [ ! -d $AMI_DIR ]; then echo "Error: run.sh requires a directory argument" - exit 1; -fi + exit 1; +fi # And transcripts check if [ ! -f $SEGS ]; then @@ -48,7 +48,7 @@ awk '{meeting=$1; channel="MDM"; speaker=$3; stime=$4; etime=$5; # (1c) Make segment files from transcript #segments file format is: utt-id side-id start-time end-time, e.g.: #AMI_ES2011a_H00_FEE041_0003415_0003484 -awk '{ +awk '{ segment=$1; split(segment,S,"[_]"); audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; @@ -71,12 +71,12 @@ awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $ #prep reco2file_and_channel cat $tmpdir/wav.scp | \ - perl -ane '$_ =~ m:^(\S+MDM)\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+MDM)\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_"; print "$1 $2 A\n"; ' > $tmpdir/reco2file_and_channel || exit 1; # we assume we adapt to the session only awk '{print $1}' $tmpdir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1\n";' \ > $tmpdir/utt2spk || exit 1; @@ -85,26 +85,27 @@ sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exi # but we want to properly score the overlapped segments, hence we generate the extra # utt2spk_stm file containing speakers ids used to generate the stms for mdm/sdm case awk '{print $1}' $tmpdir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1$2\n";' > $tmpdir/utt2spk_stm || exit 1; #check and correct case when segment timings for a given speaker overlap themself #(important for simulatenous asclite scoring to proceed). 
#There is actually only one such case for devset and automatic segmentetions join $tmpdir/utt2spk_stm $tmpdir/segments | \ - perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; - if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" - } - $pu=$_[1]; $pt=$_[4]; - }' > $tmpdir/segments_to_fix + awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; + if(spk_prev == spk && t_end_prev > t_beg) { + print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + } + spk_prev=spk; t_end_prev=t_end; + }' > $tmpdir/segments_to_fix + if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix while read line; do p1=`echo $line | awk -F'>' '{print $1}'` p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $tmpdir/segments + sed -ir "s:$p1:$p2:" $tmpdir/segments done < $tmpdir/segments_to_fix fi diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 90690731ec9..01173d2e3a6 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -1,15 +1,15 @@ #!/bin/bash # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) -# AMI Corpus dev/eval data preparation +# AMI Corpus dev/eval data preparation . path.sh #check existing directories if [ $# != 3 ]; then echo "Usage: ami_sdm_scoring_data_prep.sh " - exit 1; -fi + exit 1; +fi AMI_DIR=$1 MICNUM=$2 @@ -25,8 +25,8 @@ mkdir -p $tmpdir # Audio data directory check if [ ! -d $AMI_DIR ]; then echo "Error: run.sh requires a directory argument" - exit 1; -fi + exit 1; +fi # And transcripts check if [ ! -f $SEGS ]; then @@ -53,7 +53,7 @@ awk '{meeting=$1; channel="SDM"; speaker=$3; stime=$4; etime=$5; # (1c) Make segment files from transcript #segments file format is: utt-id side-id start-time end-time, e.g.: #AMI_ES2011a_H00_FEE041_0003415_0003484 -awk '{ +awk '{ segment=$1; split(segment,S,"[_]"); audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; @@ -76,13 +76,13 @@ awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $ #prep reco2file_and_channel cat $tmpdir/wav.scp | \ - perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav.*$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav.*$: || die "bad label $_"; print "$1 $2 A\n"; '\ > $tmpdir/reco2file_and_channel || exit 1; # we assume we adapt to the session only awk '{print $1}' $tmpdir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1\n";' \ > $tmpdir/utt2spk || exit 1; @@ -91,27 +91,28 @@ sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exi # but we want to properly score the overlapped segments, hence we generate the extra # utt2spk_stm file containing speakers ids used to generate the stms for mdm/sdm case awk '{print $1}' $tmpdir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1$2\n";' \ > $tmpdir/utt2spk_stm || exit 1; -#check and correct the case when segment timings for given speaker overlap themself +#check and correct the case when segment timings for given speaker overlap themself #(important for simulatenous asclite scoring to proceed). 
#There is actually only one such case for devset and automatic segmentetions join $tmpdir/utt2spk_stm $tmpdir/segments | \ - perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; - if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" - } - $pu=$_[1]; $pt=$_[4]; - }' > $tmpdir/segments_to_fix + awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; + if(spk_prev == spk && t_end_prev > t_beg) { + print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + } + spk_prev=spk; t_end_prev=t_end; + }' > $tmpdir/segments_to_fix + if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix while read line; do p1=`echo $line | awk -F'>' '{print $1}'` p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $tmpdir/segments + sed -ir "s:$p1:$p2:" $tmpdir/segments done < $tmpdir/segments_to_fix fi diff --git a/egs/ami/s5/local/ami_text_prep.sh b/egs/ami/s5/local/ami_text_prep.sh index 0b87d10e4de..777c3d8b086 100755 --- a/egs/ami/s5/local/ami_text_prep.sh +++ b/egs/ami/s5/local/ami_text_prep.sh @@ -9,29 +9,30 @@ if [ $# -ne 1 ]; then exit 1; fi -set -e -set -u +set -eux -amidir=$1 -mkdir -p $amidir +dir=$1 +mkdir -p $dir -echo "Downloading annotiations..." +echo "Downloading annotations..." amiurl=http://groups.inf.ed.ac.uk/ami annotver=ami_public_manual_1.6.1 -annot="$amidir/$annotver" +annot="$dir/$annotver" logdir=data/local/downloads; mkdir -p $logdir/log [ ! -f $annot.zip ] && wget -nv -O $annot.zip $amiurl/AMICorpusAnnotations/$annotver.zip &> $logdir/log/download_ami_annot.log -mkdir -p $amidir/annotations -unzip -o -d $amidir/annotations $annot.zip &> /dev/null +if [ ! -d $dir/annotations ]; then + mkdir -p $dir/annotations + unzip -o -d $dir/annotations $annot.zip &> /dev/null +fi -[ ! -f "$amidir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-Metadata.xml not found under $amidir/annotations." && exit 1; +[ ! -f "$dir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-Metadata.xml not found under $dir/annotations." && exit 1; # extract text from AMI XML annotations, -local/ami_xml2text.sh $amidir +local/ami_xml2text.sh $dir wdir=data/local/annotations [ ! -f $wdir/transcripts1 ] && echo "$0: File $wdir/transcripts1 not found." && exit 1; @@ -39,7 +40,7 @@ wdir=data/local/annotations echo "Preprocessing transcripts..." local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log -#make final train/dev/eval splits +# make final train/dev/eval splits for dset in train eval dev; do [ ! -f local/split_$dset.final ] && cp local/split_$dset.orig local/split_$dset.final grep -f local/split_$dset.final $wdir/transcripts2 > $wdir/$dset.txt diff --git a/egs/ami/s5/local/ami_xml2text.sh b/egs/ami/s5/local/ami_xml2text.sh index 4d5431c6a4d..c4b90a33702 100755 --- a/egs/ami/s5/local/ami_xml2text.sh +++ b/egs/ami/s5/local/ami_xml2text.sh @@ -19,7 +19,8 @@ JAVA_VER=$(java -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; if [ "$JAVA_VER" -ge 15 ]; then if [ ! -d $wdir/nxt ]; then echo "Downloading NXT annotation tool..." - wget -O $wdir/nxt.zip http://sourceforge.net/projects/nite/files/nite/nxt_1.4.4/nxt_1.4.4.zip &> /dev/null + wget -O $wdir/nxt.zip http://sourceforge.net/projects/nite/files/nite/nxt_1.4.4/nxt_1.4.4.zip + [ ! -s $wdir/nxt.zip ] && echo "Downloading failed! 
($wdir/nxt.zip)" && exit 1 unzip -d $wdir/nxt $wdir/nxt.zip &> /dev/null fi diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh new file mode 100755 index 00000000000..d9437af7e0c --- /dev/null +++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh @@ -0,0 +1,178 @@ +#!/bin/bash + + +### +# Does not give improvements over xent+blstm system !! +#local/chain/run_blstm_ami_5.sh --mic sdm1 --use-ihm-ali false --max-wer 45 --affix msl1.5_45wer +# %WER 42.5 | 14769 94491 | 61.0 19.9 19.1 3.5 42.5 67.5 | 0.605 | exp/sdm1/chain/blstm_ami5_msl1.5_45wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +# %WER 45.7 | 13674 89971 | 57.7 21.0 21.3 3.5 45.7 69.1 | 0.572 | exp/sdm1/chain/blstm_ami5_msl1.5_45wer_sp/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +decode_stage=1 +mic=ihm +use_ihm_ali=false +affix= +common_egs_dir= +exp_name=blstm_ami5 + +# LSTM options +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 + + +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +# training options +# chain options +xent_regularize=0.1 +max_wer=45 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 data/$mic/$latgen_train_set $lang $ali_dir $treedir + +fi + +# combining the segments in training data to have a minimum length of frames_per_eg + tolerance +# this is critical stage in AMI (gives 1% absolute improvement) +if [ -z $min_seg_len ]; then + min_seg_len=$(python -c "print ($frames_per_eg+5)/100.0") +fi + +if [ $stage -le 12 ]; then + rm -rf data/$mic/${train_set}_min${min_seg_len}_hires + steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \ + --input-data-dir data/$mic/${train_set}_hires \ + --output-data-dir data/$mic/${train_set}_min${min_seg_len}_hires + + #extract ivectors for the new data + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ + data/$mic/${train_set}_min${min_seg_len}_hires data/$mic/${train_set}_min${min_seg_len}_hires_max2 + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/$mic/${train_set}_min${min_seg_len}_hires_max2 \ + exp/$mic/nnet3/extractor \ + exp/$mic/nnet3/ivectors_${train_set}_min${min_seg_len} || exit 1; + + # combine the non-hires features for alignments/lattices + rm -rf data/$mic/${latgen_train_set}_min${min_seg_len} + steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \ + --input-data-dir data/$mic/${latgen_train_set} \ + --output-data-dir data/$mic/${latgen_train_set}_min${min_seg_len} +fi + +train_set=${train_set}_min${min_seg_len} +latgen_train_set=${latgen_train_set}_min${min_seg_len} +ivector_dir=exp/$mic/nnet3/ivectors_${train_set} +ali_dir=${ali_dir}_min${min_seg_len} +lat_dir=${lat_dir}_min${min_seg_len} +if [ $stage -le 13 ]; then + # realigning data as the segments would have changed + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" data/$mic/$latgen_train_set data/lang $gmm_dir $ali_dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" data/$mic/$latgen_train_set \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +mkdir -p $dir +train_data_dir=data/$mic/${train_set}_hires +if [ ! -z $max_wer ]; then + if [ $stage -le 15 ]; then + bad_utts_dir=${gmm_dir}_${mic}_${train_set}_bad_utts # added mic in name as this can be ihm directory where parallel mdm and sdm utts are written + if [ ! -f $bad_utts_dir/all_info.sorted.txt ]; then + # This stage takes a lot of time ~7hrs, so run only if file is not available already + steps/cleanup/find_bad_utts.sh --cmd "$decode_cmd" --nj 405 data/$mic/$latgen_train_set data/lang $ali_dir $bad_utts_dir + fi + python local/sort_bad_utts.py --bad-utt-info-file $bad_utts_dir/all_info.sorted.txt --max-wer $max_wer --output-file $dir/wer_sorted_utts_${max_wer}wer + utils/copy_data_dir.sh --validate-opts "--no-wav" data/$mic/${train_set}_hires data/$mic/${train_set}_${max_wer}wer_hires + utils/filter_scp.pl $dir/wer_sorted_utts_${max_wer}wer data/$mic/${train_set}_hires/feats.scp > data/$mic/${train_set}_${max_wer}wer_hires/feats.scp + utils/fix_data_dir.sh data/$mic/${train_set}_${max_wer}wer_hires + fi + train_data_dir=data/$mic/${train_set}_${max_wer}wer_hires + # we don't realign again as the segment ids don't change +fi + +cat > $dir/vars <|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graph/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graph/phones/align_lexicon.txt \ + r=1 $graph/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graph/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt $lmwt \ + $dev_data $graph $word_feats $dev_latdir $dev_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $eval_data $graph $eval_latdir $dev_caldir $eval_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. +# Please compare with the default scoring script for your database. 
+ +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$eval_caldir/ctm_calibrated +stm=$eval_data/stm +glm=$eval_data/glm + +# Normalizng CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \ + grep -i -v -E ' (ACH|AH|EEE|EH|ER|EW|HA|HEE|HM|HMM|HUH|MM|OOF|UH|UM) ' | \ + grep -i -v -E '' >${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $eval_data/segments $eval_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv diff --git a/egs/ami/s5/local/nnet/run_dnn.sh b/egs/ami/s5/local/nnet/run_dnn.sh index 9e4264cb7f0..c7b9db11acc 100755 --- a/egs/ami/s5/local/nnet/run_dnn.sh +++ b/egs/ami/s5/local/nnet/run_dnn.sh @@ -14,13 +14,13 @@ stage=0 # resume training with --stage=N # if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` + printf "\nUSAGE: %s [opts] \n\n" `basename $0` exit 1; fi mic=$1 gmmdir=exp/$mic/tri4a -data_fmllr=data-fmllr-tri4 +data_fmllr=data_${mic}-fmllr-tri4 final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 @@ -28,10 +28,7 @@ graph_dir=$gmmdir/graph_${LM} # Set bash to 'debug' mode, it will exit on : # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x +set -euxo pipefail # Store fMLLR features, so we can train on them easily, if [ $stage -le 0 ]; then @@ -102,13 +99,13 @@ if [ $stage -le 4 ]; then steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir # Decode (reuse HCLG graph) - for ITER in 4 3 2 1; do + for ITER in 4 1; do steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ - --num-threads 3 --nnet $dir/${ITER}.nnet --acwt $acwt \ - $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM} + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM}_it${ITER} steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ - --num-threads 3 --nnet $dir/${ITER}.nnet --acwt $acwt \ - $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM} + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM}_it${ITER} done fi diff --git a/egs/ami/s5/local/nnet/run_dnn_lda_mllt.sh b/egs/ami/s5/local/nnet/run_dnn_lda_mllt.sh index 04cc7fe7052..4caf140093d 100755 --- a/egs/ami/s5/local/nnet/run_dnn_lda_mllt.sh +++ b/egs/ami/s5/local/nnet/run_dnn_lda_mllt.sh @@ -14,7 +14,7 @@ stage=0 # resume training with --stage=N # if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` + printf "\nUSAGE: %s [opts] \n\n" `basename $0` exit 1; fi mic=$1 @@ -50,6 +50,7 @@ if [ $stage -le 1 ]; then # - re-use CMVN options, feat_dim=$(feat-to-dim scp:data/$mic/train/feats.scp -) cmvn_opts=$(cat $gmmdir/cmvn_opts) + [ -z $cmvn_opts ] && cmvn_opts="--norm-means=true --norm-vars=false" # GMM default, { echo " $feat_dim $((feat_dim*7)) [ -3 -2 -1 0 1 2 3 ]" echo " $((feat_dim*7)) 40 $gmmdir/final.mat" @@ -105,13 +106,13 @@ if [ $stage -le 4 ]; then steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ data/$mic/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir # Decode (reuse HCLG graph) - for ITER in 4 3 2 1; do + for ITER in 4 1; do steps/nnet/decode.sh --nj $nj_dev --cmd "$decode_cmd" --config 
conf/decode_dnn.conf \ - --num-threads 3 --nnet $dir/${ITER}.nnet --acwt $acwt \ - $graph_dir data/$mic/dev $dir/decode_dev_${LM} + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir data/$mic/dev $dir/decode_dev_${LM}_it${ITER} steps/nnet/decode.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode_dnn.conf \ - --num-threads 3 --nnet $dir/${ITER}.nnet --acwt $acwt \ - $graph_dir data/$mic/eval $dir/decode_eval_${LM} + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir data/$mic/eval $dir/decode_eval_${LM}_it${ITER} done fi diff --git a/egs/ami/s5/local/nnet3/prepare_parallel_datadirs.sh b/egs/ami/s5/local/nnet3/prepare_parallel_datadirs.sh new file mode 100755 index 00000000000..df069929377 --- /dev/null +++ b/egs/ami/s5/local/nnet3/prepare_parallel_datadirs.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# this script creates a new data directory data/$new_mic +# where the train, dev and eval directories are copied from $original_mic +# in addition to these a new data directory train_parallel is created which has +# the segment ids from data/$original_mic but the wav data is copied from +# data/$parallel_mic + +original_mic=sdm1 +parallel_mic=ihm +new_mic=sdm1_cleanali + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +#copy the clean data directory and change the segment ids +for dset in train dev eval; do + utils/copy_data_dir.sh data/$original_mic/$dset data/$new_mic/$dset +done +dset=train +utils/copy_data_dir.sh data/$parallel_mic/$dset data/$new_mic/${dset}_parallel +rm -rf data/$new_mic/${dset}_parallel/{text,feats.scp,cmvn.scp} +cp data/$new_mic/$dset/{spk2utt,text,utt2spk} data/$new_mic/${dset}_parallel +cp data/$new_mic/${dset}_parallel/wav.scp data/$new_mic/${dset}_parallel/wav.scp_full +cp data/$new_mic/${dset}_parallel/reco2file_and_channel data/$new_mic/${dset}_parallel/reco2file_and_channel_full + +dset=train +# map sdm/mdm segments to the ihm segments +tmpdir=`mktemp -d ./tmpXXX` +cat data/$parallel_mic/$dset/segments | sed -e "s/_H[0-9][0-9]_//g" > $tmpdir/key2ihm +cat data/$new_mic/$dset/segments | awk '{print $1}' > $tmpdir/dm_utts +mic_basename=$(echo $original_mic | sed -e "s/[0-9]//g") +if [ $mic_basename == "sdm" ]; then + pattern="_SDM_" +else + pattern="_MDM_" +fi +cat $tmpdir/dm_utts | sed -e "s/$pattern//g" > $tmpdir/key +paste -d' ' $tmpdir/key $tmpdir/dm_utts > $tmpdir/key2dm + +python -c " +ihm = dict(map(lambda x: [x.split()[0], ' '.join(x.split()[1:])], open('$tmpdir/key2ihm').readlines())) +dm = dict(map(lambda x: x.split(), open('$tmpdir/key2dm').readlines())) + +keys = ihm.keys() +keys.sort() + +for key in keys : + try: + print '{0} {1}'.format(dm[key], ihm[key]) + except KeyError: + continue +" > data/$new_mic/${dset}_parallel/segments + +cat data/$new_mic/${dset}_parallel/segments | awk '{print $2}' |sort -u > $tmpdir/ids +utils/filter_scp.pl $tmpdir/ids \ + data/$new_mic/${dset}_parallel/wav.scp_full > \ + data/$new_mic/${dset}_parallel/wav.scp + +utils/filter_scp.pl $tmpdir/ids \ + data/$new_mic/${dset}_parallel/reco2file_and_channel_full > \ + data/$new_mic/${dset}_parallel/reco2file_and_channel +utils/fix_data_dir.sh data/$new_mic/${dset}_parallel + +exit 0; diff --git a/egs/ami/s5/local/nnet3/prepare_parallel_perturbed_alignments.sh b/egs/ami/s5/local/nnet3/prepare_parallel_perturbed_alignments.sh new file mode 100755 index 00000000000..4041ecde27e --- /dev/null +++ b/egs/ami/s5/local/nnet3/prepare_parallel_perturbed_alignments.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +# This script creates the parallel data dir based on ihm data, +# creates speed 
perturbed versions of this parallel data +# and generates the corresponding alignments. +# The parallel data dir has segment ids from distant microphone data +# but the wav data is copied from ihm. + +mic=sdm1 +new_mic=sdm1_cleanali +use_sat_alignments=true +nj=10 +stage=0 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +speed_perturb_datadir() { + mic=$1 + dataset=$2 + extract_features=$3 + + utils/perturb_data_dir_speed.sh 0.9 data/$mic/$dataset data/$mic/temp1 + utils/perturb_data_dir_speed.sh 1.0 data/$mic/$dataset data/$mic/temp2 + utils/perturb_data_dir_speed.sh 1.1 data/$mic/$dataset data/$mic/temp3 + utils/combine_data.sh --extra-files utt2uniq data/$mic/${dataset}_sp data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 + rm -r data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 + + if [ "$extract_features" == "true" ]; then + mfccdir=mfcc_${mic}_perturbed + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + for x in ${dataset}_sp; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \ + data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; + done + fi + utils/fix_data_dir.sh data/$mic/${dataset}_sp +} + +if [ $stage -le 0 ]; then + # we will use ihm alignments as targets + # but as the segment names differ we will create a new data dir + local/nnet3/prepare_parallel_datadirs.sh --original-mic $mic \ + --parallel-mic ihm \ + --new-mic $new_mic +fi + +mic=$new_mic +if [ $stage -le 1 ]; then +# extract the features for the parallel data dir which will be used for alignments +# in case there is no speed perturbation + mfccdir=mfcc_${mic} + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \ + data/${mic}/train_parallel exp/make_${mic}_mfcc/train_parallel $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$mic/train_parallel exp/make_${mic}_mfcc/train_parallel $mfccdir || exit 1; + utils/fix_data_dir.sh data/$mic/train_parallel +fi + +if [ $stage -le 2 ]; then + # if we are using the ihm alignments we just need features for the parallel + # data, the actual data is being perturbed just so that we can copy this + # directory to create hiresolution features later + speed_perturb_datadir $mic train_parallel true + speed_perturb_datadir $mic train false +fi + +if [ $stage -le 3 ]; then + # we just need to recreate alignments in case we perturbed the data + # or in the case we are using ihm alignments, else the alignments would already + # have been generated when we built the GMM-HMM systems + data_set=train_parallel_sp + if [ "$use_sat_alignments" == "true" ]; then + gmm_dir=exp/ihm/tri4a + align_script=steps/align_fmllr.sh + else + gmm_dir=exp/ihm/tri3a + align_script=steps/align_si.sh + fi + $align_script --nj $nj --cmd "$train_cmd" \ + data/$mic/train_parallel_sp data/lang $gmm_dir ${gmm_dir}_${mic}_${data_set}_ali || exit 1; +fi + +exit 0; diff --git a/egs/ami/s5/local/nnet3/prepare_perturbed_alignments.sh b/egs/ami/s5/local/nnet3/prepare_perturbed_alignments.sh new file mode 100755 index 00000000000..4c9e26aa13f --- /dev/null +++ b/egs/ami/s5/local/nnet3/prepare_perturbed_alignments.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# This script creates speed perturbed versions of the training data +# and generates the corresponding alignments + +mic=ihm +nj=10 +stage=0 +use_sat_alignments=true + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +speed_perturb_datadir() { + mic=$1 + dataset=$2 + extract_features=$3 + + utils/perturb_data_dir_speed.sh 0.9 data/$mic/$dataset data/$mic/temp1 + utils/perturb_data_dir_speed.sh 1.0 data/$mic/$dataset data/$mic/temp2 + utils/perturb_data_dir_speed.sh 1.1 data/$mic/$dataset data/$mic/temp3 + utils/combine_data.sh --extra-files utt2uniq data/$mic/${dataset}_sp data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 + rm -r data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 + + if [ "$extract_features" == "true" ]; then + mfccdir=mfcc_${mic}_perturbed + for x in ${dataset}_sp; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \ + data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; + done + fi + utils/fix_data_dir.sh data/$mic/${dataset}_sp +} + + +if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to perturb the normal data to get the alignment + # _sp stands for speed-perturbed + speed_perturb_datadir $mic train true +fi + + +if [ $stage -le 2 ]; then + # we just need to recreate alignments in case we perturbed the data + # or in the case we are using ihm alignments, else the alignments would already + # have been generated when we built the GMM-HMM systems + data_set=train_sp + if [ "$use_sat_alignments" == "true" ]; then + gmm_dir=exp/$mic/tri4a + align_script=steps/align_fmllr.sh + else + gmm_dir=exp/$mic/tri3a + align_script=steps/align_si.sh + fi + $align_script --nj $nj --cmd "$train_cmd" \ + data/$mic/train_sp data/lang $gmm_dir ${gmm_dir}_${mic}_${data_set}_ali || exit 1; +fi + +exit 0; diff --git a/egs/ami/s5/local/nnet3/run_blstm.sh b/egs/ami/s5/local/nnet3/run_blstm.sh new file mode 100755 index 00000000000..d5dee155ba2 --- /dev/null +++ b/egs/ami/s5/local/nnet3/run_blstm.sh @@ -0,0 +1,46 @@ +stage=0 +train_stage=-10 +mic=ihm +affix=bidirectional +common_egs_dir= +remove_egs=true +use_ihm_ali=false +use_sat_alignments=true + +# BLSTM params +cell_dim=512 +rp_dim=128 +nrp_dim=128 +chunk_left_context=40 +chunk_right_context=40 + +# training options +num_jobs_initial=2 +num_jobs_final=12 +samples_per_iter=20000 +realign_times= +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +local/nnet3/run_lstm.sh --affix $affix \ + --stage $stage \ + --train-stage $train_stage \ + --lstm-delay " [-1,1] [-2,2] [-3,3] " \ + --label-delay 0 \ + --cell-dim $cell_dim \ + --recurrent-projection-dim $rp_dim \ + --non-recurrent-projection-dim $nrp_dim \ + --common-egs-dir "$common_egs_dir" \ + --chunk-left-context $chunk_left_context \ + --chunk-right-context $chunk_right_context \ + --mic $mic \ + --num-jobs-initial $num_jobs_initial \ + --num-jobs-final $num_jobs_final \ + --samples-per-iter $samples_per_iter \ + --use-ihm-ali $use_ihm_ali \ + --use-sat-alignments $use_sat_alignments \ + --realign-times "$realign_times" \ + --remove-egs $remove_egs + diff --git a/egs/ami/s5/local/nnet3/run_ivector_common.sh b/egs/ami/s5/local/nnet3/run_ivector_common.sh index 227c2fbe209..1b5e64c04fb 100755 --- a/egs/ami/s5/local/nnet3/run_ivector_common.sh +++ b/egs/ami/s5/local/nnet3/run_ivector_common.sh @@ -1,30 +1,63 @@ #!/bin/bash # this script contains some common (shared) parts of the run_nnet*.sh scripts. - -. 
cmd.sh - +# speed perturbation is done for the training data stage=0 mic=ihm num_threads_ubm=32 -speed_perturb=true +nj=10 +use_ihm_ali=false use_sat_alignments=true -set -e . cmd.sh . ./path.sh . ./utils/parse_options.sh -if [ "$use_sat_alignments" == "true" ] ; then - gmm_dir=exp/$mic/tri4a - align_script=steps/align_fmllr.sh +volume_perturb_datadir() { + dir=$1 + cat $dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $dir/wav.scp_scaled || exit 1; + mv $dir/wav.scp $dir/wav.scp_nonorm + mv $dir/wav.scp_scaled $dir/wav.scp +} + +if [ "$use_sat_alignments" == "true" ]; then + gmm=tri4a +else + gmm=tri3a +fi + +if [ "$use_ihm_ali" == "true" ]; then + if [ "$mic" == "ihm" ]; then + echo "This is an IHM setup, using the use_ihm_ali=true options does not make sense. Rerun with use_ihm_ali=false" && exit 1; + fi + # prepare the parallel data directory ${mic}_clean_ali + # generate alignments from the perturbed parallel data + local/nnet3/prepare_parallel_perturbed_alignments.sh --stage $stage \ + --mic $mic \ + --new-mic ${mic}_cleanali \ + --use-sat-alignments $use_sat_alignments + # we are going to modify the mic name as changing the alignments + # changes the ivector extractor + mic=${mic}_cleanali + ali_dir=exp/ihm/${gmm}_${mic}_train_parallel_sp_ali else - gmm_dir=exp/$mic/tri3a - align_script=steps/align_si.sh + # prepare the perturbed data directory and generate alignments + local/nnet3/prepare_perturbed_alignments.sh --stage $stage --mic $mic \ + --use-sat-alignments $use_sat_alignments + + ali_dir=exp/$mic/${gmm}_${mic}_train_sp_ali fi -if [ $stage -le 1 ]; then +if [ $stage -le 4 ]; then # Create high-resolution MFCC features (with 40 cepstra instead of 13). # this shows how you can split across multiple file-systems. we'll split the # MFCC dir across multiple locations. You might want to be careful here, if you @@ -35,96 +68,50 @@ if [ $stage -le 1 ]; then utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in train dev eval; do + for datadir in train_sp dev eval; do utils/copy_data_dir.sh data/$mic/$datadir data/$mic/${datadir}_hires - if [ "$datadir" == "train" ]; then - dir=data/$mic/train_hires - cat $dir/wav.scp | python -c " -import sys, os, subprocess, re, random -scale_low = 1.0/8 -scale_high = 2.0 -for line in sys.stdin.readlines(): - if len(line.strip()) == 0: - continue - print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) -"| sort -k1,1 -u > $dir/wav.scp_scaled || exit 1; - mv $dir/wav.scp $dir/wav.scp_nonorm - mv $dir/wav.scp_scaled $dir/wav.scp + if [ "$datadir" == "train_sp" ]; then + volume_perturb_datadir data/$mic/${datadir}_hires fi - steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/$mic/${datadir}_hires exp/make_${mic}_hires/$datadir $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires exp/make_${mic}_hires/$mic/$datadir $mfccdir || exit 1; - done + utils/fix_data_dir.sh data/$mic/${datadir}_hires + done fi -if [ $stage -le 2 ]; then +if [ $stage -le 5 ]; then # Train a system just for its LDA+MLLT transform. 
We use --num-iters 13 # because after we get the transform (12th iter is the last), any further # training is pointless. steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ --realign-iters "" \ --splice-opts "--left-context=3 --right-context=3" \ - 5000 10000 data/$mic/train_hires data/lang \ - ${gmm_dir}_ali exp/$mic/nnet3/tri5 + 5000 10000 data/$mic/train_sp_hires data/lang \ + $ali_dir exp/$mic/nnet3/tri5 fi -if [ $stage -le 3 ]; then +if [ $stage -le 6 ]; then steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \ --num-frames 700000 \ --num-threads $num_threads_ubm \ - data/$mic/train_hires 512 exp/$mic/nnet3/tri5 exp/$mic/nnet3/diag_ubm + data/$mic/train_sp_hires 512 exp/$mic/nnet3/tri5 exp/$mic/nnet3/diag_ubm fi -if [ $stage -le 4 ]; then +if [ $stage -le 7 ]; then # iVector extractors can in general be sensitive to the amount of data, but # this one has a fairly small dim (defaults to 100) steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/$mic/train_hires exp/$mic/nnet3/diag_ubm exp/$mic/nnet3/extractor || exit 1; -fi - -if [ $stage -le 5 ] && [ "$speed_perturb" == "true" ]; then - #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment - # _sp stands for speed-perturbed - utils/perturb_data_dir_speed.sh 0.9 data/$mic/train data/$mic/temp1 - utils/perturb_data_dir_speed.sh 1.0 data/$mic/train data/$mic/temp2 - utils/perturb_data_dir_speed.sh 1.1 data/$mic/train data/$mic/temp3 - utils/combine_data.sh --extra-files utt2uniq data/$mic/train_sp data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 - rm -r data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 - - mfccdir=mfcc_${mic}_perturbed - for x in train_sp; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \ - data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; - done - utils/fix_data_dir.sh data/$mic/train_sp - - $align_script --nj $nj --cmd "$train_cmd" \ - data/$mic/train_sp data/lang $gmm_dir ${gmm_dir}_sp_ali || exit 1 - - #Now perturb the high resolution daa - utils/copy_data_dir.sh data/$mic/train_sp data/$mic/train_sp_hires - mfccdir=mfcc_${mic}_perturbed_hires - for x in train_sp_hires; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --mfcc-config conf/mfcc_hires.conf \ - data/$mic/$x exp/make_${mic}_hires/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_hires/$x $mfccdir || exit 1; - done - utils/fix_data_dir.sh data/$mic/train_sp_hires + data/$mic/train_sp_hires exp/$mic/nnet3/diag_ubm exp/$mic/nnet3/extractor || exit 1; fi -if [ "$speed_perturb" == "true" ]; then - train_set=train_sp -else - train_set=train -fi -if [ $stage -le 6 ]; then - rm exp/$mic/nnet3/.error 2>/dev/null - ivectordir=exp/$mic/nnet3/ivectors_${train}_hires +if [ $stage -le 8 ]; then + rm -f exp/$mic/nnet3/.error 2>/dev/null + ivectordir=exp/$mic/nnet3/ivectors_train_sp_hires if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage fi @@ -135,13 +122,23 @@ if [ $stage -le 6 ]; then # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). 
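A note on the `--utts-per-spk-max 2` step that follows: it splits every real speaker into pseudo-speakers holding at most two utterances each, so the iVector extractor is trained on many "speakers" with little data per speaker, which is what the comment above is getting at. A minimal sketch of the utt2spk rewrite, with hypothetical paths; the real `steps/online/nnet2/copy_data_dir.sh` also copies feats.scp, cmvn.scp and the other data-dir files:

```sh
# Sketch only: cap each speaker at 2 utterances by renaming speakers in utt2spk.
awk '{ n[$2]++; printf("%s %s-%d\n", $1, $2, int((n[$2]-1)/2)); }' \
  data/ihm/train_sp_hires/utt2spk > data/ihm/train_sp_hires_max2/utt2spk
```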
- steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/$mic/${train}_hires data/$mic/${train}_hires_max2 + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/$mic/train_sp_hires data/$mic/train_sp_hires_max2 steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/$mic/${train}_hires_max2 \ + data/$mic/train_sp_hires_max2 \ exp/$mic/nnet3/extractor \ - exp/$mic/nnet3/ivectors_${train}_hires \ + exp/$mic/nnet3/ivectors_train_sp_hires \ || touch exp/$mic/nnet3/.error [ -f exp/$mic/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; fi +if [ $stage -le 9 ]; then + rm -f exp/$mic/nnet3/.error 2>/dev/null + for data in dev eval; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ + data/$mic/${data}_hires exp/$mic/nnet3/extractor exp/$mic/nnet3/ivectors_${data} || touch exp/$mic/nnet3/.error & + done + wait + [ -f exp/$mic/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; +fi + exit 0; diff --git a/egs/ami/s5/local/nnet3/run_lstm.sh b/egs/ami/s5/local/nnet3/run_lstm.sh index c98d8340278..d077d14cc1e 100755 --- a/egs/ami/s5/local/nnet3/run_lstm.sh +++ b/egs/ami/s5/local/nnet3/run_lstm.sh @@ -1,119 +1,191 @@ #!/bin/bash -# this is a basic lstm script +# Copyright 2015 Johns Hopkins University (Author: Daniel Povey). +# 2015 Vijayaditya Peddinti +# 2015 Xingyu Na +# 2015 Pegah Ghahrmani +# Apache 2.0. + + +# this is a basic lstm script, it can also be used to train blstm models. +# the blstm can be run using local/nnet3/run_blstm.sh which invokes this script +# with the necessary parameters +# Note: lstm script runs for more epochs than the tdnn script # At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. -set -e +# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false stage=0 train_stage=-10 -has_fisher=true mic=ihm -use_sat_alignments=true +use_ihm_ali=false +use_sat_alignments=false # if true, use tri4a alignments are used + # by default GMM-HMM systems are not built to this stage + # in SDM and MDM systems. So run the tri4a stage if you + # want to use this option affix= -speed_perturb=true -splice_indexes="-2,-1,0,1,2 0" common_egs_dir= -. cmd.sh +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 +shrink=0.99 +max_param_change=2.0 + +# training options +num_epochs=10 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=2 +num_jobs_final=12 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true +realign_times= + +# feature options +use_ivectors=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= +decode_iter= + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh if ! cuda-compiled; then - cat < # minumum LM-weight for lattice rescoring " echo " --max_lmwt # maximum LM-weight for lattice rescoring " - echo " --reverse (true/false) # score with time reversed features " exit 1; fi @@ -30,9 +30,9 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. 
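A few lines below, this scoring script derives the CTM frame shift from the model directory. The string splice `--frame-shift=0.0$factor` turns a `frame_subsampling_factor` of, say, 3 into `--frame-shift=0.03`, i.e. 30 ms CTM frames; this relies on the base frame shift being 10 ms and the factor being a single digit, which holds for the common factor of 3. A quick illustration with assumed values:

```sh
# Illustration of the frame-shift derivation used further down in this script.
factor=3                                     # contents of $dir/../frame_subsampling_factor
frame_shift_opt="--frame-shift=0.0$factor"   # expands to --frame-shift=0.03
echo "$frame_shift_opt"
```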
dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. -hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; hubdir=`dirname $hubscr` @@ -41,56 +41,93 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; done +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + name=`basename $data`; # e.g. eval2000 +nj=$(cat $dir/num_jobs) mkdir -p $dir/ascoring/log if [ $stage -le 0 ]; then - if $reverse; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/ascore_LMWT/ '&&' \ - lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-reverse ark:- ark:- \| \ - lattice-align-words --reorder=false $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/ascore_LMWT/$name.ctm || exit 1; - else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/ascore_LMWT/ '&&' \ - lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + for LMWT in $(seq $min_lmwt $max_lmwt); do + rm -f $dir/.error + ( + $cmd JOB=1:$nj $dir/ascoring/log/get_ctm.${LMWT}.JOB.log \ + mkdir -p $dir/ascore_${LMWT}/ '&&' \ + lattice-scale --inv-acoustic-scale=${LMWT} "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \ + lattice-limit-depth ark:- ark:- \| \ + lattice-push --push-strings=false ark:- ark:- \| \ + lattice-align-words-lexicon --max-expand=10.0 \ + $lang/phones/align_lexicon.int $model ark:- ark:- \| \ + lattice-to-ctm-conf $frame_shift_opt --decode-mbr=$decode_mbr ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/ascore_LMWT/$name.ctm || exit 1; - fi + '>' $dir/ascore_${LMWT}/${name}.JOB.ctm || touch $dir/.error; + # Merge and clean, + for ((n=1; n<=nj; n++)); do cat $dir/ascore_${LMWT}/${name}.${n}.ctm; done > $dir/ascore_${LMWT}/${name}.ctm + rm -f $dir/ascore_${LMWT}/${name}.*.ctm + )& + done + wait; + [ -f $dir/.error ] && echo "$0: error during ctm generation. check $dir/ascoring/log/get_ctm.*.log" && exit 1; fi if [ $stage -le 1 ]; then # Remove some stuff we don't want to score, from the ctm. - for x in $dir/ascore_*/$name.ctm; do +# - we remove hesitations here, otherwise the CTM would have a bug! +# (confidences in place of the removed hesitations), + for x in $dir/ascore_*/${name}.ctm; do cp $x $dir/tmpf; cat $dir/tmpf | grep -i -v -E '\[noise|laughter|vocalized-noise\]' | \ + grep -i -v -E ' (ACH|AH|EEE|EH|ER|EW|HA|HEE|HM|HMM|HUH|MM|OOF|UH|UM) ' | \ grep -i -v -E '' > $x; # grep -i -v -E '|%HESITATION' > $x; done fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then if [ "$asclite" == "true" ]; then oname=$name [ ! 
-z $overlap_spk ] && oname=${name}_o$overlap_spk + echo "asclite is starting" + # Run scoring, meaning of hubscr.pl options: + # -G .. produce alignment graphs, + # -v .. verbose, + # -m .. max-memory in GBs, + # -o .. max N of overlapping speakers, + # -a .. use asclite, + # -C .. compression for asclite, + # -B .. blocksize for asclite (kBs?), + # -p .. path for other components, + # -V .. skip validation of input transcripts, + # -h rt-stt .. removes non-lexical items from CTM, $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/score.LMWT.log \ cp $data/stm $dir/ascore_LMWT/ '&&' \ cp $dir/ascore_LMWT/${name}.ctm $dir/ascore_LMWT/${oname}.ctm '&&' \ $hubscr -G -v -m 1:2 -o$overlap_spk -a -C -B 8192 -p $hubdir -V -l english \ - -h rt-stt -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${oname}.ctm || exit 1; + -h rt-stt -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${oname}.ctm || exit 1 + # Compress some scoring outputs : alignment info and graphs, + echo -n "compressing asclite outputs " + for LMWT in $(seq $min_lmwt $max_lmwt); do + ascore=$dir/ascore_${LMWT} + gzip -f $ascore/${oname}.ctm.filt.aligninfo.csv + cp $ascore/${oname}.ctm.filt.alignments/index.html $ascore/${oname}.ctm.filt.overlap.html + tar -C $ascore -czf $ascore/${oname}.ctm.filt.alignments.tar.gz ${oname}.ctm.filt.alignments + rm -r $ascore/${oname}.ctm.filt.alignments + echo -n "LMWT:$LMWT " + done + echo done else $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/score.LMWT.log \ cp $data/stm $dir/ascore_LMWT/ '&&' \ - $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${name}.ctm || exit 1 + $hubscr -p $hubdir -v -V -l english -h hub5 -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${name}.ctm || exit 1 fi fi diff --git a/egs/ami/s5/local/sort_bad_utts.py b/egs/ami/s5/local/sort_bad_utts.py new file mode 100644 index 00000000000..f84fcb12608 --- /dev/null +++ b/egs/ami/s5/local/sort_bad_utts.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python + +import sys +import argparse +import logging + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--bad-utt-info-file", type=str, required=True) + parser.add_argument("--output-file", type=str, required=True) + parser.add_argument("--max-wer", type=float, default=20) + + print(' '.join(sys.argv)) + args = parser.parse_args() + + return args + +def GetSortedWers(utt_info_file): + utt_wer = [] + for line in open(utt_info_file, 'r'): + parts = line.split() + utt = parts[0] + wer = float(parts[1])/float(parts[2])*100 + utt_wer.append([utt, wer]) + + utt_wer_sorted = sorted(utt_wer, key = lambda k : k[1]) + try: + import numpy as np + bins = range(0,105,5) + bins.append(sys.float_info.max) + + hist, bin_edges = np.histogram(map(lambda x: x[1], utt_wer_sorted), + bins = bins) + num_utts = len(utt_wer) + string = '' + for i in range(len(hist)): + string += '[{0}, {1}] {2}\n'.format(bin_edges[i], bin_edges[i+1], float(hist[i])/num_utts * 100) + logger.info("The histogram is \n {0}".format(string)) + except ImportError: + pass + + return 
utt_wer_sorted + +def Main(): + args = GetArgs() + utt_wer_sorted = GetSortedWers(args.bad_utt_info_file) + out_file = open(args.output_file, 'w') + logger.info("Writing output to file : {0}.".format(args.output_file)) + + for row in utt_wer_sorted: + if row[1] <= args.max_wer: + out_file.write('{0} {1}\n'.format(row[0], row[1])) + out_file.close() +if __name__ == "__main__": + Main() diff --git a/egs/ami/s5/path.sh b/egs/ami/s5/path.sh index 52e44195f51..ad2c93b309b 100644 --- a/egs/ami/s5/path.sh +++ b/egs/ami/s5/path.sh @@ -1,11 +1,13 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/nnet3bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C LMBIN=$KALDI_ROOT/tools/irstlm/bin SRILM=$KALDI_ROOT/tools/srilm/bin/i686-m64 -BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt-3.51 +BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt export PATH=$PATH:$LMBIN:$BEAMFORMIT:$SRILM diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index 4590ba1deb8..b9d60d78182 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -10,21 +10,24 @@ mic=ihm stage=0 . utils/parse_options.sh -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail # Path where AMI gets downloaded (or where locally available): -[ ! -r conf/ami_dir ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 -AMI_DIR=$(cat conf/ami_dir) - +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in + fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, + clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, + cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, +esac + +[ ! -r data/local/lm/final_lm ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 # Download AMI corpus, You need arount 130GB of free space to get whole data ihm+mdm, +# Avoiding re-download, using 'wget --continue ...', if [ $stage -le 0 ]; then [ -e data/local/downloads/wget_$mic.sh ] && \ echo "$data/local/downloads/wget_$mic.sh already exists, better quit than re-download... 
(use --stage N)" && \ @@ -54,9 +57,8 @@ fi if [ $stage -le 3 ]; then # Taking a subset, now unused, can be handy for quick experiments, - # Full set 77h, reduced set 9.5h, - local/remove_dup_utts.sh 20 data/$mic/train data/$mic/train_nodup # remvove uh-huh, - utils/subset_data_dir.sh --shortest data/$mic/train_nodup 30000 data/$mic/train_30k + # Full set 77h, reduced set 10.8h, + utils/subset_data_dir.sh data/$mic/train 15000 data/$mic/train_15k fi # Train systems, @@ -84,7 +86,7 @@ if [ $stage -le 5 ]; then data/$mic/train data/lang exp/$mic/tri2a exp/$mic/tri2_ali # Decode, graph_dir=exp/$mic/tri2a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/dev exp/$mic/tri2a/decode_dev_${LM} @@ -102,26 +104,26 @@ if [ $stage -le 6 ]; then data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali # Decode, graph_dir=exp/$mic/tri3a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/eval exp/$mic/tri3a/decode_eval_${LM} -fi +fi if [ $stage -le 7 ]; then # Train tri4a, which is LDA+MLLT+SAT, steps/train_sat.sh --cmd "$train_cmd" \ 5000 80000 data/$mic/train data/lang exp/$mic/tri3a_ali exp/$mic/tri4a - # Decode, + # Decode, graph_dir=exp/$mic/tri4a/graph_${LM} $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} + $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} fi nj_mmi=80 @@ -158,11 +160,11 @@ if [ $stage -le 11 ]; then decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_dev_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_dev_${LM} --iter $i \ - $graph_dir data/$mic/dev $decode_dir + $graph_dir data/$mic/dev $decode_dir decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_eval_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_eval_${LM} --iter $i \ - $graph_dir data/$mic/eval $decode_dir + $graph_dir data/$mic/eval $decode_dir done fi @@ -179,11 +181,11 @@ if [ $stage -le 13 ]; then --hidden-dim 950 \ --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3" \ --use-sat-alignments true - + local/online/run_nnet2_ms_sp_disc.sh \ --mic $mic \ --gmm-dir exp/$mic/tri4a \ --srcdir exp/$mic/nnet2_online/nnet_ms_sp fi -echo "Done!" +echo "Done." diff --git a/egs/ami/s5/run_mdm.sh b/egs/ami/s5/run_mdm.sh index 5d1d964e2b1..3c147e0aa99 100755 --- a/egs/ami/s5/run_mdm.sh +++ b/egs/ami/s5/run_mdm.sh @@ -7,28 +7,31 @@ nmics=8 #we use all 8 channels, possible other options are 2 and 4 mic=mdm$nmics -stage=0 -. 
utils/parse_options.sh - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x - # Path where AMI gets downloaded (or where locally available): -[ ! -r conf/ami_dir ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 -AMI_DIR=$(cat conf/ami_dir) +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in + fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, + clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, + cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, +esac # MDM_DIR is directory for beamformed waves, -MDM_DIR=$AMI_DIR/beamformed # [Default] -#MDM_DIR=/disk/data1/s1136550/ami/mdm # [Edinburgh] +MDM_DIR=$AMI_DIR/beamformed # Default, +#MDM_DIR=/disk/data1/s1136550/ami/mdm # Edinburgh, +[ ! -r data/local/lm/final_lm ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 +stage=0 +. utils/parse_options.sh + +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail + # Download AMI corpus (distant channels), You need around 130GB of free space to get whole data ihm+mdm, +# Avoiding re-download, using 'wget --continue ...', if [ $stage -le 0 ]; then [ -e data/local/downloads/wget_mdm.sh ] && \ echo "data/local/downloads/wget_mdm.sh already exists, better quit than re-download... (use --stage N)" && \ @@ -64,9 +67,8 @@ fi if [ $stage -le 4 ]; then # Taking a subset, now unused, can be handy for quick experiments, - # Full set 77h, reduced set 9.5h, - local/remove_dup_utts.sh 20 data/$mic/train data/$mic/train_nodup # remvove uh-huh, - utils/subset_data_dir.sh --shortest data/$mic/train_nodup 30000 data/$mic/train_30k + # Full set 77h, reduced set 10.8h, + utils/subset_data_dir.sh data/$mic/train 15000 data/$mic/train_15k fi # Train systems, @@ -179,5 +181,4 @@ if [ $stage -le 13 ]; then --srcdir exp/$mic/nnet2_online/nnet_ms_sp fi - -echo "Done!" +echo "Done." diff --git a/egs/ami/s5/run_prepare_shared.sh b/egs/ami/s5/run_prepare_shared.sh index b931e910bb9..903de4125b8 100755 --- a/egs/ami/s5/run_prepare_shared.sh +++ b/egs/ami/s5/run_prepare_shared.sh @@ -3,34 +3,17 @@ . ./cmd.sh . ./path.sh -# To run this script you need SRILM, - # Path to Fisher transcripts LM interpolation (if not defined only AMI transcript LM is built), -FISHER_TRANS=`pwd`/eddie_data/lm/data/fisher/part1 # Edinburgh, [DEFAULT] -# Path where AMI gets downloaded (or where locally available), -AMI_DIR=$PWD/DOWNLOAD/amicorpus # [DEFAULT] - -# We can make setup specific to the 'domain' where the cluster is, -case "$(hostname -d)" in - fit.vutbr.cz) # BUT cluster, - FISHER_TRANS=/mnt/matylda2/data/FISHER/fe_03_p1_tran - AMI_DIR=$(mktemp -d $(find /mnt/scratch*/$USER -maxdepth 0)/kaldi_ami_data_XXXXXX) - ;; - *) echo "Using defaults locations," - ;; +case $(hostname -d) in + fit.vutbr.cz) FISHER_TRANS=/mnt/matylda2/data/FISHER/fe_03_p1_tran ;; # BUT, + clsp.jhu.edu) FISHER_TRANS=/export/corpora4/ami/fisher_trans/part1 ;; # JHU, + cstr.ed.ac.uk) FISHER_TRANS=`pwd`/eddie_data/lm/data/fisher/part1 ;; # Edinburgh, esac +# Or select manually, +# FISHER_TRANS=... -# We can override the automatic setup by : -# './run_prepare_shared.sh --AMI-DIR [dir] --FISHER-TRANS [dir]' . 
utils/parse_options.sh -# Load previous / store the new AMI_DIR location, -[ -r conf/ami_dir ] && AMI_DIR=$(cat conf/ami_dir) || echo $AMI_DIR >conf/ami_dir - -if [ -z $IRSTLM ] ; then - export IRSTLM=$KALDI_ROOT/tools/irstlm/ -fi -export PATH=${PATH}:$IRSTLM/bin if ! command -v prune-lm >/dev/null 2>&1 ; then echo "$0: Error: the IRSTLM is not available or compiled" >&2 echo "$0: Error: We used to install it by default, but." >&2 @@ -40,13 +23,19 @@ if ! command -v prune-lm >/dev/null 2>&1 ; then exit 1 fi -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -x +if ! command -v ngram-count >/dev/null 2>&1 ; then + echo "$0: Error: the SRILM is not available or compiled" >&2 + echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2 + echo "$0: Error: and run extras/install_srilm.sh" >&2 + exit 1 +fi + +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail -local/ami_text_prep.sh $AMI_DIR +# Download of annotations, pre-processing, +local/ami_text_prep.sh data/local/downloads local/ami_prepare_dict.sh utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -58,6 +47,6 @@ LM=$final_lm.pr1-7 prune-lm --threshold=1e-7 data/local/lm/$final_lm.gz /dev/stdout | gzip -c > data/local/lm/$LM.gz utils/format_lm.sh data/lang data/local/lm/$LM.gz data/local/dict/lexicon.txt data/lang_$LM -echo "Done!" +echo "Done" exit 0 diff --git a/egs/ami/s5/run_sdm.sh b/egs/ami/s5/run_sdm.sh old mode 100644 new mode 100755 index 3ae7e2c67df..99dd80941e4 --- a/egs/ami/s5/run_sdm.sh +++ b/egs/ami/s5/run_sdm.sh @@ -3,31 +3,34 @@ . ./cmd.sh . ./path.sh -# SDM - Signle Distant Microphone +# SDM - Signle Distant Microphone micid=1 #which mic from array should be used? mic=sdm$micid -stage=0 +stage=1 . utils/parse_options.sh -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail # Path where AMI gets downloaded (or where locally available): -[ ! -r conf/ami_dir ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 -AMI_DIR=$(cat conf/ami_dir) - +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in + fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, + clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, + cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, +esac + +[ ! -r data/local/lm/final_lm ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 # Download AMI corpus (distant channels), You need arount 130GB of free space to get whole data ihm+mdm, -if [ $stage -le 0 ]; then +# Avoiding re-download, using 'wget --continue ...', +if [ $stage -le 1 ]; then [ -e data/local/downloads/wget_sdm.sh ] && \ - echo "$data/local/downloads/wget_sdm.sh already exists, better quit than re-download... (use --stage N)" && \ + echo "data/local/downloads/wget_sdm.sh already exists, better quit than re-download... 
(use --stage N)" && \ exit 1 local/ami_download.sh --mics $micid sdm $AMI_DIR fi @@ -53,9 +56,8 @@ fi if [ $stage -le 4 ]; then # Taking a subset, now unused, can be handy for quick experiments, - # Full set 77h, reduced set 9.5h, - local/remove_dup_utts.sh 20 data/$mic/train data/$mic/train_nodup # remvove uh-huh, - utils/subset_data_dir.sh --shortest data/$mic/train_nodup 30000 data/$mic/train_30k + # Full set 77h, reduced set 10.8h, + utils/subset_data_dir.sh data/$mic/train 15000 data/$mic/train_15k fi # Train systems, @@ -161,16 +163,36 @@ if [ $stage -le 13 ]; then --hidden-dim 850 \ --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3" \ --use-sat-alignments false - + local/online/run_nnet2_ms_sp_disc.sh \ --mic $mic \ --gmm-dir exp/$mic/tri3a \ --srcdir exp/$mic/nnet2_online/nnet_ms_sp fi + +#TDNN training (nnet3) +if [ $stage -le 14 ]; then + local/nnet3/run_tdnn.sh \ + --mic $mic \ + --speed-perturb true \ + --stage 9 \ + --use-sat-alignments false +fi +exit 1; + +#LSTM training (nnet3) +if [ $stage -le 15 ]; then + local/nnet3/run_lstm.sh \ + --mic $mic \ + --train-stage -5 \ + --speed-perturb true \ + --use-sat-alignments false +fi + echo "Done." -# By default we do not build systems adapted to sessions for AMI in distant scnearios +# By default we do not build systems adapted to sessions for AMI in distant scnearios # as this does not help a lot (around 1%), but one can do this by running below code: exit; @@ -186,7 +208,7 @@ graph_dir=exp/$mic/tri4a/graph_${LM} $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir steps/decode_fmllr.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} steps/decode_fmllr.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} + $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} diff --git a/egs/ami/s5/run_sdm_lstm.sh b/egs/ami/s5/run_sdm_lstm.sh deleted file mode 100755 index ca4978f554c..00000000000 --- a/egs/ami/s5/run_sdm_lstm.sh +++ /dev/null @@ -1,201 +0,0 @@ -#!/bin/bash -u - -. ./cmd.sh -. ./path.sh - -# SDM - Signle Distant Microphone -micid=1 #which mic from array should be used? -mic=sdm$micid - -stage=0 -. utils/parse_options.sh - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x - -# Path where AMI gets downloaded (or where locally available): -[ ! -r conf/ami_dir ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 -AMI_DIR=$(cat conf/ami_dir) - -final_lm=`cat data/local/lm/final_lm` -LM=$final_lm.pr1-7 - -# Download AMI corpus (distant channels), You need arount 130GB of free space to get whole data ihm+mdm, -if [ $stage -le 0 ]; then - [ -e data/local/downloads/wget_sdm.sh ] && \ - echo "$data/local/downloads/wget_sdm.sh already exists, better quit than re-download... 
(use --stage N)" && \ - exit 1 - local/ami_download.sh --mics $micid sdm $AMI_DIR -fi - -# Prepare mdm data directories, -if [ $stage -le 2 ]; then - local/ami_sdm_data_prep.sh $AMI_DIR $micid - local/ami_sdm_scoring_data_prep.sh $AMI_DIR $micid dev - local/ami_sdm_scoring_data_prep.sh $AMI_DIR $micid eval -fi -# Here starts the normal recipe, which is mostly shared across mic scenarios, -# - for ihm we adapt to speaker by fMLLR, -# - for sdm and mdm we do not adapt for speaker, but for environment only (cmn), - -# Feature extraction, -if [ $stage -le 3 ]; then - for dset in train dev eval; do - steps/make_mfcc.sh --nj 15 --cmd "$train_cmd" data/$mic/$dset data/$mic/$dset/log data/$mic/$dset/data - steps/compute_cmvn_stats.sh data/$mic/$dset data/$mic/$dset/log data/$mic/$dset/data - done - for dset in train eval dev; do utils/fix_data_dir.sh data/$mic/$dset; done -fi - -if [ $stage -le 4 ]; then - # Taking a subset, now unused, can be handy for quick experiments, - # Full set 77h, reduced set 9.5h, - local/remove_dup_utts.sh 20 data/$mic/train data/$mic/train_nodup # remvove uh-huh, - utils/subset_data_dir.sh --shortest data/$mic/train_nodup 30000 data/$mic/train_30k -fi - -# Train systems, -nj=30 # number of parallel jobs, -nj_dev=$(cat data/$mic/dev/spk2utt | wc -l) -nj_eval=$(cat data/$mic/eval/spk2utt | wc -l) - -if [ $stage -le 5 ]; then - # Mono, - steps/train_mono.sh --nj $nj --cmd "$train_cmd" --cmvn-opts "--norm-means=true --norm-vars=false" \ - data/$mic/train data/lang exp/$mic/mono - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ - data/$mic/train data/lang exp/$mic/mono exp/$mic/mono_ali - - # Deltas, - steps/train_deltas.sh --cmd "$train_cmd" --cmvn-opts "--norm-means=true --norm-vars=false" \ - 5000 80000 data/$mic/train data/lang exp/$mic/mono_ali exp/$mic/tri1 - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ - data/$mic/train data/lang exp/$mic/tri1 exp/$mic/tri1_ali -fi - -if [ $stage -le 6 ]; then - # Deltas again, (full train-set), - steps/train_deltas.sh --cmd "$train_cmd" --cmvn-opts "--norm-means=true --norm-vars=false" \ - 5000 80000 data/$mic/train data/lang exp/$mic/tri1_ali exp/$mic/tri2a - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ - data/$mic/train data/lang exp/$mic/tri2a exp/$mic/tri2_ali - # Decode, - graph_dir=exp/$mic/tri2a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2a $graph_dir - steps/decode.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri2a/decode_dev_${LM} - steps/decode.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri2a/decode_eval_${LM} -fi - -# THE TARGET LDA+MLLT+SAT+BMMI PART GOES HERE: - -if [ $stage -le 7 ]; then - # Train tri3a, which is LDA+MLLT, - steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" \ - 5000 80000 data/$mic/train data/lang exp/$mic/tri2_ali exp/$mic/tri3a - # Decode, - graph_dir=exp/$mic/tri3a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3a $graph_dir - steps/decode.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} - steps/decode.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri3a/decode_eval_${LM} -fi - -# skip SAT, and build MMI models -nj_mmi=80 -if [ $stage -le 8 ]; then - steps/align_si.sh --nj $nj_mmi --cmd "$train_cmd" \ 
- data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali -fi - -# At this point you can already run the DNN script: -# local/nnet/run_dnn_lda_mllt.sh -# exit 0 - -if [ $stage -le 9 ]; then - steps/make_denlats.sh --nj $nj_mmi --cmd "$decode_cmd" --config conf/decode.conf \ - data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_denlats -fi - -# 4 iterations of MMI seems to work well overall. The number of iterations is -# used as an explicit argument even though train_mmi.sh will use 4 iterations by -# default. -if [ $stage -le 10 ]; then - num_mmi_iters=4 - steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 --num-iters $num_mmi_iters \ - data/$mic/train data/lang exp/$mic/tri3a_ali exp/$mic/tri3a_denlats \ - exp/$mic/tri3a_mmi_b0.1 -fi -if [ $stage -le 11 ]; then - # Decode, - graph_dir=exp/$mic/tri3a/graph_${LM} - for i in 4 3 2 1; do - decode_dir=exp/$mic/tri3a_mmi_b0.1/decode_dev_${i}.mdl_${LM} - steps/decode.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - --iter $i $graph_dir data/$mic/dev $decode_dir - decode_dir=exp/$mic/tri3a_mmi_b0.1/decode_eval_${i}.mdl_${LM} - steps/decode.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - --iter $i $graph_dir data/$mic/eval $decode_dir - done -fi - -# DNN training. This script is based on egs/swbd/s5b/local/run_dnn.sh -# Some of them would be out of date. -if [ $stage -le 12 ]; then - local/nnet/run_dnn_lda_mllt.sh $mic -fi - -# TDNN training. -if [ $stage -le 13 ]; then - local/nnet3/run_tdnn.sh \ - --mic $mic \ - --hidden-dim 850 \ - --speed-perturb true \ - --stage 7 \ - --use-sat-alignments false -fi - -#LSTM training -if [ $stage -le 14 ]; then - local/nnet3/run_lstm.sh \ - --mic $mic \ - --train-stage -5 \ - --speed-perturb true \ - --stage 7 \ - --common-egs-dir exp/sdm1/nnet3/lstm_sp/egs \ - --use-sat-alignments false -fi - - -echo "Done." - - -# By default we do not build systems adapted to sessions for AMI in distant scnearios -# as this does not help a lot (around 1%), but one can do this by running below code: -exit; - -# Train tri4a, which is LDA+MLLT+SAT, -steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ - data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali-fmllr - -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 80000 data/$mic/train data/lang exp/$mic/tri3a_ali-fmllr exp/$mic/tri4a - -# Decode, -graph_dir=exp/$mic/tri4a/graph_${LM} -$highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir -steps/decode_fmllr.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} -steps/decode_fmllr.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} - diff --git a/egs/apiai_decode/s5/README.md b/egs/apiai_decode/s5/README.md new file mode 100644 index 00000000000..c6d9bd23b77 --- /dev/null +++ b/egs/apiai_decode/s5/README.md @@ -0,0 +1,53 @@ +# Api.ai model decoding example scripts +This directory contains scripts on how to use a pre-trained chain enlgish model and kaldi base code to recognize any number of wav files. + +IMPORTANT: wav files must be in 16kHz, 16 bit little-endian format. + +## Model +English pretrained model were released by Api.ai under Creative Commons Attribution-ShareAlike 4.0 International Public License. 
+- Acustic data is mostly mobile recorded data +- Language model is based on Assistant.ai logs and good for understanding short commands, like "Wake me up at 7 am" +For more details, visit https://github.com/api-ai/api-ai-english-asr-model + +## Usage +Ensure kaldi is compiled and this scripts are inside kaldi/egs// directory then run +```sh +$ ./download-model.sh # to download pretrained chain model +$ ./recognize-wav.sh test1.wav test2.wav # to do recognition +``` +See console output for recognition results. + +### Using steps/nnet3/decode.sh +You can use kaldi steps/nnet3/decode.sh, which will decode data and calculate Word Error Rate (WER) for it. + +Run: +```sh +$ recognize-wav.sh test1.wav test2.wav +``` +It will make data dir, calculate mfcc features for it and do decoding, you need only first two steps out of it. If you want WER then edit data/test-corpus/text and replace NO_TRANSCRIPTION with expected text transcription for every wav file. + +Run for decoding: +```sh +$ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --cmd run.pl --nj 1 exp/api.ai-model/ data/test-corpus/ exp/api.ai-model/decode/ +``` +See exp/api.ai-model/decode/wer* files for WER and exp/api.ai-model/decode/log/ files for decoding output. + +### Online Decoder: +See http://kaldi.sourceforge.net/online_decoding.html for more information about kaldi online decoding. + +Run: +```sh +$./local/create-corpus.sh data/test-corpus/ test1.wav test2.wav +``` +If you want WER then edit data/test-corpus/text and replace NO_TRANSCRIPTION with expected text transcription for every wav file. + +Make config file exp/api.ai-model/conf/online.conf with following content: +``` +--feature-type=mfcc +--mfcc-config=exp/api.ai-model/mfcc.conf +``` +Then run: +```sh +$ steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --cmd run.pl --nj 1 exp/api.ai-model/ data/test-corpus/ exp/api.ai-model/decode/ +``` +See exp/api.ai-model/decode/wer* files for WER and exp/api.ai-model/decode/log/ files for decoding output. \ No newline at end of file diff --git a/egs/apiai_decode/s5/download-model.sh b/egs/apiai_decode/s5/download-model.sh new file mode 100755 index 00000000000..0847c3fb914 --- /dev/null +++ b/egs/apiai_decode/s5/download-model.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Downlaods Api.ai chain model into exp/api.ai-model/ (will replace one if exists) + +DOWNLOAD_URL="https://api.ai/downloads/api.ai-kaldi-asr-model.zip" + +echo "Downloading model" +wget -N $DOWNLOAD_URL || ( echo "Unable to download model: $DOWNLOAD_URL" && exit 1 ); + +echo "Unpacking model" +unzip api.ai-kaldi-asr-model.zip || ( echo "Unable to extract api.ai-kaldi-asr-model.zip" && exit 1 ); + +echo "Moving model to exp/api.ai-model/" +if [ ! -d exp ]; then + mkdir exp; +fi; + +if [ -d exp/api.ai-model ]; then + echo "Found existing model, removing"; + rm -rf exp/api.ai-model/ +fi + +mv api.ai-kaldi-asr-model exp/api.ai-model || ( echo "Unable to move model to exp/" && exit 1 ) + +echo "Model is ready to use use recognize-wav.sh to do voice recognition" diff --git a/egs/apiai_decode/s5/local/create-corpus.sh b/egs/apiai_decode/s5/local/create-corpus.sh new file mode 100755 index 00000000000..a101128b4ac --- /dev/null +++ b/egs/apiai_decode/s5/local/create-corpus.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Checking arguments +if [ $# -le 1 ]; then + echo "Use $0 test1.wav [test2.wav] ..." 
+ echo " $0 data/test-corpus test1.wav test2.wav" + exit 0; +fi + +CORPUS=$1 +shift +for file in "$@"; do + if [[ "$file" != *.wav ]]; then + echo "Expecting .wav files, got $file" + exit 1; + fi + + if [ ! -f "$file" ]; then + echo "$file not found"; + exit 1; + fi +done; + + +echo "Initilizing $CORPUS" +if [ ! -d "$CORPUS" ]; then + echo "Creating $CORPUS directory" + mkdir -p "$CORPUS" || ( echo "Unable to create data dir" && exit 1 ) +fi; + +wav_scp="$CORPUS/wav.scp" +spk2utt="$CORPUS/spk2utt" +utt2spk="$CORPUS/utt2spk" +text="$CORPUS/text" + +#nulling files +cat $wav_scp +cat $spk2utt +cat $utt2spk +cat $text +rm $CORPUS/feats.scp 2>/dev/null; +rm $CORPUS/cmvn.scp 2>/dev/null; + +for file in "$@"; do + id=$(echo $file | sed -e 's/ /_/g') + echo "$id $file" >>$wav_scp + echo "$id $id" >>$spk2utt + echo "$id $id" >>$utt2spk + echo "$id NO_TRANSRIPTION" >>$text +done; diff --git a/egs/apiai_decode/s5/local/score.sh b/egs/apiai_decode/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/apiai_decode/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/apiai_decode/s5/path.sh b/egs/apiai_decode/s5/path.sh new file mode 100755 index 00000000000..8b177b18ab2 --- /dev/null +++ b/egs/apiai_decode/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/src/path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/apiai_decode/s5/recognize-wav.sh b/egs/apiai_decode/s5/recognize-wav.sh new file mode 100755 index 00000000000..cba3e70a4fc --- /dev/null +++ b/egs/apiai_decode/s5/recognize-wav.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2016 Api.ai (Author: Ilya Platonov) +# Apache 2.0 + +# This script demonstrates kaldi decoding using pretrained model. It will decode list of wav files. +# +# IMPORTANT: wav files must be in 16kHz, 16 bit little-endian format. +# +# This script tries to follow with what other scripts are doing in terms of directory structures and data handling. +# +# Use ./download-model.sh script to download asr model +# See https://github.com/api-ai/api-ai-english-asr-model for details about a model and how to use it. + +. path.sh +MODEL_DIR="exp/api.ai-model" +DATA_DIR="data/test-corpus" + +echo "///////" +echo "// IMPORTANT: wav files must be in 16kHz, 16 bit little-endian format." +echo "//////"; + +for file in final.mdl HCLG.fst words.txt frame_subsampling_factor; do + if [ ! 
-f $MODEL_DIR/$file ]; then + echo "$MODEL_DIR/$file not found, use ./download-model.sh" + exit 1; + fi +done; + +for app in nnet3-latgen-faster apply-cmvn lattice-scale; do + command -v $app >/dev/null 2>&1 || { echo >&2 "$app not found, is kaldi compiled?"; exit 1; } +done; + +local/create-corpus.sh $DATA_DIR $@ || exit 1; + +echo "///////" +echo "// Computing mfcc and cmvn (cmvn is not really used)" +echo "//////"; + + steps/make_mfcc.sh --nj 1 --mfcc-config $MODEL_DIR/mfcc.conf \ + --cmd "run.pl" $DATA_DIR exp/make_mfcc exp/mfcc || { echo "Unable to calculate mfcc, ensure 16kHz, 16 bit little-endian wav format or see log"; exit 1; }; + steps/compute_cmvn_stats.sh $DATA_DIR exp/make_mfcc/ exp/mfcc || exit 1; + +echo "///////" +echo "// Doing decoding (see log for results)" +echo "//////"; +frame_subsampling_factor=$(cat $MODEL_DIR/frame_subsampling_factor) +nnet3-latgen-faster --frame-subsampling-factor=$frame_subsampling_factor --frames-per-chunk=50 --extra-left-context=0 \ + --extra-right-context=0 --extra-left-context-initial=-1 --extra-right-context-final=-1 \ + --minimize=false --max-active=7000 --min-active=200 --beam=15.0 --lattice-beam=8.0 \ + --acoustic-scale=1.0 --allow-partial=true \ + --word-symbol-table=$MODEL_DIR/words.txt $MODEL_DIR/final.mdl $MODEL_DIR//HCLG.fst \ + "ark,s,cs:apply-cmvn --norm-means=false --norm-vars=false --utt2spk=ark:$DATA_DIR/utt2spk scp:$DATA_DIR/cmvn.scp scp:$DATA_DIR/feats.scp ark:- |" \ + "ark:|lattice-scale --acoustic-scale=10.0 ark:- ark:- >exp/lat.1" \ No newline at end of file diff --git a/egs/apiai_decode/s5/steps b/egs/apiai_decode/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/apiai_decode/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/apiai_decode/s5/utils b/egs/apiai_decode/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/apiai_decode/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/aspire/s5/local/fisher_create_test_lang.sh b/egs/aspire/s5/local/fisher_create_test_lang.sh index e17d95c4b47..924f6e6c4ba 100755 --- a/egs/aspire/s5/local/fisher_create_test_lang.sh +++ b/egs/aspire/s5/local/fisher_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -10,26 +10,12 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz cp -rT data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
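The deleted comment above describes the old hand-built G.fst pipeline (OOV filtering, eps2disambig/s2eps, explicit fstcompile); after this patch all of that is handled internally by a single `arpa2fst` call, which appears interleaved with the deletions just below. For readability, the resulting build reads in full:

```sh
# G.fst build as it stands after this patch (same files as before).
gunzip -c "$arpa_lm" | \
  arpa2fst --disambig-symbol=#0 \
           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst   # the first number printed should be small
```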
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst - + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -61,4 +47,3 @@ utils/build_const_arpa_lm.sh \ data/local/lm/4gram-mincount/lm_unpruned.gz data/lang data/lang_test_fg echo "$0 succeeded" - diff --git a/egs/aspire/s5/local/multi_condition/check_version.sh b/egs/aspire/s5/local/multi_condition/check_version.sh index 4c9af0b00cf..81c415a3d67 100755 --- a/egs/aspire/s5/local/multi_condition/check_version.sh +++ b/egs/aspire/s5/local/multi_condition/check_version.sh @@ -1,6 +1,23 @@ #!/bin/bash # Script to check the tool versions necessary for the aspire recipe +function check_for_bad_sox { + if which sox >&/dev/null; then # sox is on the path + sox_version=$(sox --version | awk -F 'v' '{print $2}' | awk -F '.' '{print $1 "." $2}') + if [ "$sox_version" == "14.2" ] || [ "$sox_version" == "14.3" ]; then + echo "*** WARNING: your version of sox is either 14.2.x or 14.3.x ***" + echo "*** which may cause errors in the data preparation of this recipe. ***" + echo "*** Please upgrade your sox to version 14.4 or higher. ***" + exit 1; + fi + else + echo "*** This recipe requires sox for the data preparation. 
***" + exit 1; + fi +} + +check_for_bad_sox; + python -c " from distutils.version import LooseVersion import warnings, sys diff --git a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh index a7e8f82159c..ca73a447c83 100755 --- a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh +++ b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh @@ -112,6 +112,11 @@ fi # copying the noise-rir pairing files cp ${output_dir}_non_normalized/info/* $output_dir/info +# rename file location in the noise-rir pairing files +for file in `ls $output_dir/info/noise_impulse*`; do + sed -i "s/_non_normalized//g" $file +done + # generating the rir-list with probabilities alloted for each rir db_string_python=$(echo $db_string|sed -e "s/'\s\+'/','/g") python -c " diff --git a/egs/aspire/s5/local/multi_condition/read_rir.py b/egs/aspire/s5/local/multi_condition/read_rir.py index 1229e508d2a..e2510ac7d61 100755 --- a/egs/aspire/s5/local/multi_condition/read_rir.py +++ b/egs/aspire/s5/local/multi_condition/read_rir.py @@ -13,14 +13,14 @@ def read_raw(input_filename, precision = np.float32): def wav_write(file_handle, fs, data): if str(data.dtype) in set(['float64', 'float32']): - data = (0.99 * data / np.max(np.abs(data))) * (2 ** 31) - data = data.astype('int32', copy = False) - elif str(data.dtype) == 'int32': + data = (0.99 * data / np.max(np.abs(data))) * (2 ** 15) + data = data.astype('int16', copy = False) + elif str(data.dtype) == 'int16': pass else: raise Exception('Not implemented for '+str(data.dtype)) scipy.io.wavfile.write(file_handle, fs, data) - + def usage(): return """This is a python script to read impulse responses stored in custom formats. It handles AIR database.""" diff --git a/egs/aspire/s5/local/multi_condition/reverberate_data_dir.sh b/egs/aspire/s5/local/multi_condition/reverberate_data_dir.sh index da0dfb90def..add00c3c5af 100755 --- a/egs/aspire/s5/local/multi_condition/reverberate_data_dir.sh +++ b/egs/aspire/s5/local/multi_condition/reverberate_data_dir.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2014 Johns Hopkins University (Author: Vijayaditya Peddinti) +# 2015 Tom Ko # Apache 2.0. # This script processes generates multi-condition training data from clean data dir # and directory with impulse responses and noises @@ -9,11 +10,8 @@ set -e random_seed=0 -num_files_per_job=100 snrs="20:10:15:5:0" log_dir=exp/make_reverb -max_jobs_run=50 -dest_wav_dir= . ./path.sh; . 
./utils/parse_options.sh @@ -29,13 +27,8 @@ src_dir=$1 impnoise_dir=$2 dest_dir=$3 -if [ -z $dest_wav_dir ]; then - dest_wav_dir=$dest_dir/wavs -fi - mkdir -p $dest_dir mkdir -p $log_dir -mkdir -p $dest_wav_dir wav_prefix="rev${random_seed}_" utt_prefix="rev${random_seed}_" @@ -48,17 +41,9 @@ cat $src_dir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' # create the wav.scp files cat $src_dir/wav.scp | sed -e "s/^\s*//g" | \ cut -d' ' -f1 | \ - awk -v p1=$dest_wav_dir -v p2=$wav_prefix \ - '{printf("%s%s%s.wav\n", p1, p2, $1);}'> $log_dir/corrupted_${random_seed}.list - -python -c " -import re -file_ids = map(lambda x: x.split()[0], open('$src_dir/wav.scp').readlines()) -dest_file_names = map(lambda x: x.split()[0], open('$log_dir/corrupted_${random_seed}.list')) -for file_id, dest_file_name in zip(file_ids, dest_file_names): - print '$wav_prefix{0} cat {1} |'.format(file_id, dest_file_name) -" > $dest_dir/wav.scp - + awk -v p2=$wav_prefix \ + '{printf("%s%s\n", p2, $1);}'> $log_dir/corrupted_${random_seed}.list + # modify segments file to point to the new wav files cat $dest_dir/segments | awk -v p=$wav_prefix \ '{printf("%s %s%s %s %s\n", $1, p, $2, $3, $4);}' > $log_dir/segments_temp @@ -71,13 +56,9 @@ for file in cmvn.scp feats.scp reco2file_and_channel; do rm -f $dest_dir/$file done -python local/multi_condition/get_reverberate_parameter_lists.py \ - --snrs $snrs --num-files-per-job $num_files_per_job --random-seed $random_seed \ +python local/multi_condition/reverberate_wavs.py \ + --snrs $snrs --random-seed $random_seed \ $src_dir/wav.scp $log_dir/corrupted_${random_seed}.list $impnoise_dir \ -$log_dir/corrupt_wavs.${random_seed}.list > $log_dir/num_corruption_jobs || exit 1; - -num_jobs=$(cat $log_dir/num_corruption_jobs) -$decode_cmd -V --max-jobs-run $max_jobs_run JOB=1:$num_jobs $log_dir/corrupt_wavs.${random_seed}.JOB.log \ - python local/multi_condition/corrupt.py --temp-file-name $log_dir/temp_JOB.wav $log_dir/corrupt_wavs.${random_seed}.JOB.list || exit 1; +$dest_dir/wav.scp || exit 1; echo "Successfully generated corrupted data and stored it in $dest_dir." && exit 0; diff --git a/egs/aspire/s5/local/multi_condition/get_reverberate_parameter_lists.py b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py similarity index 56% rename from egs/aspire/s5/local/multi_condition/get_reverberate_parameter_lists.py rename to egs/aspire/s5/local/multi_condition/reverberate_wavs.py index caa39690b23..998a3ed5e74 100755 --- a/egs/aspire/s5/local/multi_condition/get_reverberate_parameter_lists.py +++ b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # Copyright 2014 Johns Hopkins University (Authors: Vijayaditya Peddinti). Apache 2.0. 
+# 2015 Tom Ko # script to generate multicondition training data / dev data / test data import argparse, glob, math, os, random, scipy.io.wavfile, sys @@ -23,18 +24,9 @@ def return_nonempty_lines(lines): return new_lines -def exists_wavfile(file_name): - return os.path.isfile(file_name) - try: - scipy.io.wavfile.read(file_name) - return True - except IOError: - return False - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--snrs', type=str, default = '20:10:0', help='snrs to be used for corruption') - parser.add_argument('--num-files-per-job', type=int, default = None, help='number of commands to be stored in each file') parser.add_argument('--check-output-exists', type = str, default = 'True', help = 'process file only if output file does not exist', choices = ['True', 'true', 'False', 'false']) parser.add_argument('--random-seed', type = int, default = 0, help = 'seed to be used in the randomization of impulses') parser.add_argument('wav_file_list', type=str, help='wav.scp file to corrupt') @@ -75,49 +67,32 @@ def exists_wavfile(file_name): raise Exception('Unknown format of ' + file) impulse_noise_index.append([impulses_set, noises_list]) - - if params.num_files_per_job is None: - lines_per_file = len(wav_files) - else: - lines_per_file = params.num_files_per_job - num_parts = int(math.ceil(len(wav_files)/ float(lines_per_file))) - indices_per_file = map(lambda x: xrange(lines_per_file * (x-1), lines_per_file * x), range(1, num_parts)) - indices_per_file.append(xrange(lines_per_file * (num_parts-1), len(wav_files))) - - part_counter = 1 - commands_file_base, ext = os.path.splitext(params.output_command_file) - for indices in indices_per_file: - command_list = [] - for i in indices: - wav_file = " ".join(wav_files[i].split()[1:]) - output_wav_file = wav_out_files[i] - impulse_file = impulses.next() - noise_file = '' - snr = '' - found_impulse = False - if add_noise: - for i in xrange(len(impulse_noise_index)): - if impulse_file in impulse_noise_index[i][0]: - noise_file = impulse_noise_index[i][1].next() - snr = snrs.next() - assert(len(wav_file.strip()) > 0) - assert(len(impulse_file.strip()) > 0) - assert(len(noise_file.strip()) > 0) - assert(len(snr.strip()) > 0) - assert(len(output_wav_file.strip()) > 0) - command_list.append("{0} --rir-file {1} --noise-file {2} --snr-db {3} - {4} \n".format(wav_file, impulse_file, noise_file, snr, output_wav_file)) - found_impulse = True - break - if not found_impulse: - assert(len(wav_file.strip()) > 0) - assert(len(impulse_file.strip()) > 0) - assert(len(output_wav_file.strip()) > 0) - command_list.append("{0} --rir-file {1} - {2} \n".format(wav_file, impulse_file, output_wav_file)) - if exists_wavfile(output_wav_file): - # we perform the check at this point to ensure replication of (wavfile, impulse, noise, snr) tuples across runs. 
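# As a rough sketch of what the renamed script now writes out (utterance and file
# names here are invented for illustration): rather than pre-computing corrupted
# wav files on disk, every entry of the destination wav.scp becomes a pipe that
# runs wav-reverberate on the fly, e.g.
#   rev1_sw02001-A cat /data/fisher/sw02001-A.wav | wav-reverberate --noise-file=rwcp_type1_noise1.wav --snr-db=20 - air_type1_rir1.wav - |
# so the corruption is redone by whichever job reads the wav, instead of being
# stored as intermediate audio.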
- command_list.pop() - file_handle = open("{0}.{1}{2}".format(commands_file_base, part_counter, ext), 'w') - part_counter += 1 - file_handle.write("".join(command_list)) - file_handle.close() - print num_parts + command_list = [] + for i in range(len(wav_files)): + wav_file = " ".join(wav_files[i].split()[1:]) + output_wav_file = wav_out_files[i] + impulse_file = impulses.next() + noise_file = '' + snr = '' + found_impulse = False + if add_noise: + for i in xrange(len(impulse_noise_index)): + if impulse_file in impulse_noise_index[i][0]: + noise_file = impulse_noise_index[i][1].next() + snr = snrs.next() + assert(len(wav_file.strip()) > 0) + assert(len(impulse_file.strip()) > 0) + assert(len(noise_file.strip()) > 0) + assert(len(snr.strip()) > 0) + assert(len(output_wav_file.strip()) > 0) + command_list.append("{4} {0} wav-reverberate --noise-file={2} --snr-db={3} - {1} - |\n".format(wav_file, impulse_file, noise_file, snr, output_wav_file)) + found_impulse = True + break + if not found_impulse: + assert(len(wav_file.strip()) > 0) + assert(len(impulse_file.strip()) > 0) + assert(len(output_wav_file.strip()) > 0) + command_list.append("{2} {0} wav-reverberate - {1} - |\n".format(wav_file, impulse_file, output_wav_file)) + file_handle = open(params.output_command_file, 'w') + file_handle.write("".join(command_list)) + file_handle.close() diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_aalto.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_aalto.sh index 341122f73d0..f8a45c3e790 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_aalto.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_aalto.sh @@ -7,6 +7,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=AALTO file_splitter= #script to generate job scripts given the command file @@ -62,9 +63,9 @@ tmpdir=`readlink -e $tmpdir` file_count=1 for data_file in ${data_files[@]}; do # aalto has incompatible format of wav audio, which are not compatible with python's wav.read() function - # so we convert everything to 32bit PCM. 
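# With the new output_bit=16 default, the conversion written below amounts to
# something like the following (the input path and the 8k rate are assumed, just
# for illustration):
#   sox -t wav aalto_rir_orig.wav -t wav -r 8k -e signed-integer -b 16 data/impulses_noises/aalto_type1_rir.wav
# i.e. resample to $sampling_rate and rewrite as 16-bit signed PCM, matching the
# int16 output now produced by local/multi_condition/read_rir.py.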
- output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file +# output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file # echo "python local/multi_condition/read_rir.py --output-sampling-rate $sampling_rate wav ${tmpdir}/$file_count.wav ${output_dir}/${output_file_name} || exit -1;" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list file_count=$((file_count + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_air.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_air.sh index 9cb7e4fae1d..c7b6300db50 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_air.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_air.sh @@ -51,7 +51,8 @@ command_file=$log_dir/${DBname}_read_rir_noise.sh echo "">$command_file file_count=1 while read file_pattern output_file_name; do - output_file_name=`echo ${DBname}_type${type_num}_${file_count}_$output_file_name| tr '[:upper:]' '[:lower:]'` + # output_file_name=`echo ${DBname}_type${type_num}_${file_count}_$output_file_name| tr '[:upper:]' '[:lower:]'` + output_file_name=`echo ${DBname}_type${type_num}_$output_file_name| tr '[:upper:]' '[:lower:]'` echo "local/multi_condition/read_rir.py --output-sampling-rate $sampling_rate air '${file_pattern}' ${output_dir}/${output_file_name} || exit 1;" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type$type_num.rir.list file_count=$((file_count + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_c4dm.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_c4dm.sh index 513e0c481d7..8e5dd34d9ac 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_c4dm.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_c4dm.sh @@ -8,6 +8,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=C4DM file_splitter= #script to generate job scripts given the command file @@ -91,9 +92,9 @@ tmpdir=`readlink -e $tmpdir` file_count=1 for data_file in ${data_files[@]}; do # c4dm has incompatible format of wav audio, which are not compatible with python's wav.read() function - # so we convert everything to 32bit PCM. 
- output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + # output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file #echo "python local/multi_condition/read_rir.py --output-sampling-rate $sampling_rate wav ${tmpdir}/${file_count}.wav ${output_dir}/${output_file_name} || exit -1;" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list file_count=$((file_count + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_mardy.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_mardy.sh index f8c2610293f..4690b9b1861 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_mardy.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_mardy.sh @@ -7,6 +7,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=MARDY file_splitter= #script to generate job scripts given the command file @@ -47,8 +48,9 @@ echo "" > $log_dir/${DBname}_type${type_num}.rir.list echo "Found $total_files impulse responses in ${RIR_home}/mardy/" file_count=1 for data_file in ${data_files[@]}; do - output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + #output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file #echo "python local/multi_condition/read_rir.py --output-sampling-rate $sampling_rate wav ${data_file} ${output_dir}/${output_file_name} || exit -1;" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list file_count=$((file_count + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_openair.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_openair.sh index 71ec52d0d49..bd43da77079 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_openair.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_openair.sh @@ -7,6 +7,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=OPENAIR file_splitter= #script to generate job scripts given the command file @@ -432,13 +433,14 @@ echo "Found $total_files impulse responses in ${RIR_home}/open_air/" file_count=1 # affix to ensure that files with same name are not overwritten for data_file in ${data_files[@]}; do # open-air has multiple formats of wav audio, some of which are not compatible with python's wav.read() function - # so we convert everything to 32bit PCM. 
- output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file +# output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list file_count=$((file_count + 1)) done + if [ ! -z "$file_splitter" ]; then num_jobs=$($file_splitter $command_file || exit 1) job_file=${command_file%.sh}.JOB.sh diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_rvb2014.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_rvb2014.sh index 0124038d1b0..32394556f01 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_rvb2014.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_rvb2014.sh @@ -8,6 +8,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=RVB2014 file_splitter= #script to generate job scripts given the command file @@ -57,7 +58,7 @@ echo "" > $log_dir/${DBname}_type${type_num}.rir.list echo "Found $total_files impulse responses in ${Reverb2014_home1}/RIR." for data_file in ${data_files[@]}; do output_file_name=${DBname}_type${type_num}_`basename $data_file | tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list files_done=$((files_done + 1)) done @@ -69,7 +70,7 @@ echo "" > $log_dir/${DBname}_type${type_num}.noise.list echo "Found $total_files noises in ${Reverb2014_home1}/NOISE." for data_file in ${data_files[@]}; do output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.noise.list files_done=$((files_done + 1)) done @@ -83,7 +84,7 @@ echo "" > $log_dir/${DBname}_type${type_num}.rir.list echo "Found $total_files impulse responses in ${Reverb2014_home2}/RIR." for data_file in ${data_files[@]}; do output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list files_done=$((files_done + 1)) done @@ -96,7 +97,7 @@ echo "" > $log_dir/${DBname}_type${type_num}.noise.list echo "Found $total_files noises in ${Reverb2014_home2}/NOISE." 
for data_file in ${data_files[@]}; do output_file_name=${DBname}_type${type_num}_`basename $data_file | tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.noise.list files_done=$((files_done + 1)) done diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_rwcp.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_rwcp.sh index aac8efcd340..b44669b86f1 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_rwcp.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_rwcp.sh @@ -14,6 +14,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=RWCP file_splitter= #script to generate job scripts given the command file @@ -73,7 +74,7 @@ for base_dir_name in ${RWCP_dirs[@]}; do for i in `seq $first_channel $last_channel`; do channel_files="$channel_files -t raw -e float -b 32 -c 1 -r 48k $leaf_dir_name/$file_base_name.$i "; done - echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.rir.list files_done=$((files_done + 1)) done @@ -105,7 +106,7 @@ for data_file in ${data_files[@]}; do temp_file=$tempdir_robo/$files_done.wav python $tempdir_robo/raw_read.py $data_file $temp_file output_file_name=RWCP_type${type_num}_rir_`basename $data_file .dat | tr '[:upper:]' '[:lower:]'`.wav - echo "sox -t wav $temp_file -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $temp_file -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.rir.list files_done=$((files_done + 1)) done @@ -128,7 +129,7 @@ for leaf_dir_name in ${leaf_directories[@]}; do for i in `seq $first_channel $last_channel`; do channel_files="$channel_files -t raw -e signed-integer -b 16 -c 1 -r 48k $leaf_dir_name/$file_base_name.$i "; done - echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.noise.list files_done=$((files_done + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_varechoic.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_varechoic.sh index 6a0df08eb8a..4be2b1779f3 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_varechoic.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_varechoic.sh @@ -7,6 +7,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=VARECHOIC file_splitter= #script to generate job scripts given the command file @@ -47,7 +48,7 @@ varechoic_home=$RIR_home/icsi_varechoic/varechoic for room_type in ir00 ir43 ir100 ; do for mike in m1 m2 m3 m4; do file_basename=${room_type}${mike} - echo "sox -B -e float -b 32 -c 1 -r 8k -t raw $varechoic_home/${file_basename}.raw -t wav -b 32 $output_dir/${DBname}_${file_basename}.wav" >> $command_file + echo "sox 
-B -e float -b 32 -c 1 -r 8k -t raw $varechoic_home/${file_basename}.raw -t wav -b $output_bit $output_dir/${DBname}_${file_basename}.wav" >> $command_file echo $output_dir/${DBname}_${file_basename}.wav >> $log_dir/${DBname}_type$type_num.rir.list done done diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh index 11224d8e841..5b6424a1d86 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh @@ -8,12 +8,11 @@ stage=1 snrs="20:10:15:5:0" num_data_reps=3 -dest_wav_dir=data/rvb_wavs # directory to store the reverberated wav files ali_dir=exp/ db_string="'air' 'rwcp' 'rvb2014'" # RIR dbs to be used in the experiment # only dbs used for ASpIRE submission system have been used here RIR_home=db/RIR_databases/ # parent directory of the RIR databases files -download_rirs=false # download the RIR databases from the urls or assume they are present in the RIR_home directory +download_rirs=true # download the RIR databases from the urls or assume they are present in the RIR_home directory set -e . cmd.sh @@ -40,17 +39,15 @@ if [ $stage -le 1 ]; then else num_reps=1 fi - mkdir -p data/${data_dir}_rvb/wavs reverb_data_dirs= for i in `seq 1 $num_reps`; do cur_dest_dir=" data/temp_${data_dir}_${i}" - local/multi_condition/reverberate_data_dir.sh --random-seed $i --log-dir exp/make_reverb/log \ - --dest-wav-dir ${dest_wav_dir}/wavs${i}/ \ + local/multi_condition/reverberate_data_dir.sh --random-seed $i \ --snrs "$snrs" --log-dir exp/make_corrupted_wav \ data/${data_dir} data/impulses_noises $cur_dest_dir reverb_data_dirs+=" $cur_dest_dir" done - utils/combine_data.sh --extra-files utt2uniq data/${data_dir}_rvb_hires $reverb_data_dirs + utils/combine_data.sh --extra-files utt2uniq data/${data_dir}_rvb $reverb_data_dirs rm -rf $reverb_data_dirs done @@ -63,7 +60,7 @@ if [ $stage -le 1 ]; then local/multi_condition/copy_ali_dir.sh --utt-prefix "rev${i}_" exp/tri5a exp/tri5a_temp_$i || exit 1; ali_dirs+=" exp/tri5a_temp_$i" done - local/multi_condition/combine_ali_dirs.sh --ref-data-dir data/train_rvb_hires \ + local/multi_condition/combine_ali_dirs.sh --ref-data-dir data/train_rvb \ exp/tri5a_rvb_ali $ali_dirs || exit 1; # copy the alignments for training the 100k system (from tri4a) diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh index 8119ff44661..4be5efe25ec 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh @@ -13,7 +13,6 @@ stage=1 train_stage=-10 use_gpu=true dir=exp/nnet2_multicondition/nnet_ms_a -dest_wav_dir=data/rvb_wavs # directory to store the reverberated wav files set -e . cmd.sh @@ -52,7 +51,7 @@ else fi # do the common parts of the script. -local/multi_condition/run_nnet2_common.sh --dest-wav-dir $dest_wav_dir --stage $stage +local/multi_condition/run_nnet2_common.sh --stage $stage if [ $stage -le 7 ]; then diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh index 9fcf134ccce..ad5fba0929f 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh @@ -85,7 +85,7 @@ if [ $stage -le 2 ]; then # hardcode no-GPU for alignment, although you could use GPU [you wouldn't # get excellent GPU utilization though.] 
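# A minimal sketch of the throttling option used in the hunk below (the job name
# and binary are placeholders): utils/queue.pl accepts --max-jobs-run directly,
# e.g.
#   $decode_cmd --max-jobs-run 200 JOB=1:1500 exp/foo/log/align.JOB.log \
#     some-alignment-binary args...
# whereas the old "-tc" flag was a GridEngine-specific option that had to be
# passed through to qsub/qalter.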
nj=1500 # this is 6k hours, use more jobs and control the speed dynamically using - # throttle control option (-tc with qalter) + # throttle control option (--max-jobs-run with qalter) # have a high number of jobs because this could take a while, and we might # have some stragglers. max_jobs_run=200 @@ -110,14 +110,14 @@ if [ $stage -le 3 ]; then if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi steps/nnet2/get_egs_discriminative2.sh \ - --cmd "$decode_cmd -tc $max_jobs" \ + --cmd "$decode_cmd --max-jobs-run $max_jobs" \ --online-ivector-dir exp/nnet2_multicondition/ivectors_train \ --criterion $criterion --drop-frames $drop_frames \ data/train_rvb_hires data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1; # the command below is a more generic, but slower, way to do it. #steps/online/nnet2/get_egs_discriminative2.sh \ - # --cmd "$decode_cmd -tc $max_jobs" \ + # --cmd "$decode_cmd --max-jobs-run $max_jobs" \ # --criterion $criterion --drop-frames $drop_frames \ # data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1; fi diff --git a/egs/aspire/s5/local/nnet3/run_autoencoder.sh b/egs/aspire/s5/local/nnet3/run_autoencoder.sh new file mode 100644 index 00000000000..abc7f3a6234 --- /dev/null +++ b/egs/aspire/s5/local/nnet3/run_autoencoder.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# this is an example to show a "tdnn" system in raw nnet configuration +# i.e. without a transition model + +. cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +num_data_reps=10 + +remove_egs=true + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $targets_scp +done + +if [ $stage -le 9 ]; then + echo "$0: creating neural net configs"; + + num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1 + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -7,2 0" \ + --feat-dir ${data_dir} \ + --relu-dim=1024 \ + --add-lda=false \ + --objective-type=quadratic \ + --add-final-sigmoid=false \ + --include-log-softmax=false \ + --use-presoftmax-prior-scale=false \ + --num-targets=$num_targets \ + $dir/configs || exit 1; +fi + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/tdnn/train_raw_nnet.sh --stage $train_stage \ + --cmd "$decode_cmd" \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --num-epochs 2 \ + --num-jobs-initial 3 \ + --num-jobs-final 16 \ + --initial-effective-lrate 0.0017 \ + --final-effective-lrate 0.00017 \ + --egs-dir "$common_egs_dir" \ + --remove-egs $remove_egs \ + --use-gpu true \ + --dense-targets true \ + ${data_dir} $targets_scp $dir || exit 1 +fi + diff --git a/egs/aspire/s5/path.sh b/egs/aspire/s5/path.sh index e93eb33f24b..1a6fb5f891b 100755 --- a/egs/aspire/s5/path.sh +++ b/egs/aspire/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=`pwd`/../../.. 
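# A brief note on the change below (the description of common_path.sh is an
# assumption based on the standard Kaldi layout, not something stated in this
# patch): tools/config/common_path.sh appends the various $KALDI_ROOT/src/*bin
# directories to PATH, so the new two-line path.sh
#   export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
#   . $KALDI_ROOT/tools/config/common_path.sh
# covers the same binaries as the long hand-maintained export it replaces.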
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/aurora4/s5/cmd.sh b/egs/aurora4/s5/cmd.sh index 139b2cd6c6c..378febca15b 100644 --- a/egs/aurora4/s5/cmd.sh +++ b/egs/aurora4/s5/cmd.sh @@ -1,29 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still used in some example scripts +# here. export cuda_cmd="queue.pl --gpu 1" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... 
-#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/aurora4/s5/local/aurora4_format_data.sh b/egs/aurora4/s5/local/aurora4_format_data.sh index 4208c019879..0b94f7f796d 100755 --- a/egs/aurora4/s5/local/aurora4_format_data.sh +++ b/egs/aurora4/s5/local/aurora4_format_data.sh @@ -21,7 +21,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/lang_tmp/lexiconp.txt mkdir -p $tmpdir -for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do +for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.txt data/$x/text || exit 1; @@ -42,23 +42,9 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do cp -r data/lang/* $test gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - utils/validate_lang.pl --skip-determinization-check $test || exit 1; done diff --git a/egs/aurora4/s5/path.sh b/egs/aurora4/s5/path.sh index fee0b9b0c11..2d17b17a84a 100755 --- a/egs/aurora4/s5/path.sh +++ b/egs/aurora4/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/babel/s5/cmd.sh b/egs/babel/s5/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5/cmd.sh +++ b/egs/babel/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5/local/arpa2G.sh b/egs/babel/s5/local/arpa2G.sh index 67d44080fe4..f037caf0d7b 100755 --- a/egs/babel/s5/local/arpa2G.sh +++ b/egs/babel/s5/local/arpa2G.sh @@ -39,14 +39,8 @@ destdir=$3 mkdir $destdir 2>/dev/null || true gunzip -c $lmfile | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1 fstisstochastic $destdir/G.fst || true exit 0 diff --git a/egs/babel/s5/local/arpa2G_syllables.sh b/egs/babel/s5/local/arpa2G_syllables.sh index 8f10f87f019..58ef162ec2e 100755 --- a/egs/babel/s5/local/arpa2G_syllables.sh +++ b/egs/babel/s5/local/arpa2G_syllables.sh @@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1; [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1; rho=$[$last_id+1] -# state 0 is start-state. state 1 is state after we saw silence. state 2 is +# state 0 is start-state. state 1 is state after we saw silence. state 2 is # "dead state/failure state" that is not coaccessible. 
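# The recurring pattern in this patch is the single-step ARPA-to-FST compilation
# (a sketch with generic file names):
#   gunzip -c lm.arpa.gz | arpa2fst --disambig-symbol=#0 --read-symbol-table=words.txt - G.fst
# arpa2fst now maps <s>/</s> to epsilon and inserts the #0 backoff symbol itself,
# so the old fstprint | eps2disambig.pl | s2eps.pl | fstcompile | fstrmepsilon
# pipeline (removed in the hunks below) is no longer needed.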
cat < $destdir/rho.fst 0 1 $silence_id $silence_id @@ -35,16 +35,11 @@ EOF gunzip -c $lmfile | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ sed 's///g' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ + arpa2fst --disambig-symbol=#0 --ilabel-sort=false \ + --read-symbol-table=$langdir/words.txt - | \ fstrhocompose "$rho" - $destdir/rho.fst | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 + fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 fstisstochastic $destdir/G.fst || true diff --git a/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh b/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh index 1837902a7d0..a5601130343 100755 --- a/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh +++ b/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh @@ -8,7 +8,7 @@ # This script trains LMs on the WSJ LM-training data. # It requires that you have already run wsj_extend_dict.sh, # to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily +# plus any OOVs and possible acronyms that we could easily # derive pronunciations for. # This script takes as command-line arguments the relevant data/lang @@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n $dir/train_in.gz || exit 1; # Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" +echo "Getting training data with OOV words replaced with (train_nounk.gz)" gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \ 'BEGIN{while((getline0) v[$1]=1;} {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ @@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline< # To save disk space, remove the un-mapped training data. We could # easily generate it again if needed. -rm $dir/train_nounk.gz +rm $dir/train_nounk.gz ################################################################## @@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram # The default LM chosen to be the last pruned 4gram-mincount # # Note: One can cheat and provide an external ARPA LM here!!! -# To do so, make sure that +# To do so, make sure that # -- its vocabulary is fully covered by $lang/words.txt, # -- it is gzipped and # -- it is placed in the $dir directory. @@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst" . 
./path.sh || exit 1; gunzip -c $gzipped_ARPA_LM | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1; - fstisstochastic $lang/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1; +fstisstochastic $lang/G.fst ################################################################## # Redo the FST step after reviewing perplexities reported by the @@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \ ################################################################## exit 0 - diff --git a/egs/babel/s5b/cmd.sh b/egs/babel/s5b/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/babel/s5b/cmd.sh +++ b/egs/babel/s5b/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5b/local/arpa2G.sh b/egs/babel/s5b/local/arpa2G.sh index 83f789e999f..db816abc7a5 100755 --- a/egs/babel/s5b/local/arpa2G.sh +++ b/egs/babel/s5b/local/arpa2G.sh @@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then fi set -e #Exit on non-zero return code from any command -set -o pipefail #Exit if any of the commands in the pipeline will +set -o pipefail #Exit if any of the commands in the pipeline will #return non-zero return code lmfile=$1 @@ -58,7 +58,7 @@ if [ ! 
-z "$oov_prob_file" ]; then exit 1; fi - min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0; + min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0; while() { if (m/\\(\d)-grams:/) { $order = $1; } if ($order == 1) { @A = split; if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob') @@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then while() { if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; } else { print; } # print all lines unchanged except the one that says ngram 1=X. - if (m/^\\1-grams:$/) { + if (m/^\\1-grams:$/) { foreach $l (@OOVS) { @A = split(" ", $l); @A == 2 || die "bad line in oov2prob: $_;"; @@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then else decompress="cat $lmfile" fi - + $decompress | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1 + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1 + fstisstochastic $destdir/G.fst || true; if $cleanup; then diff --git a/egs/babel/s5b/local/arpa2G_syllables.sh b/egs/babel/s5b/local/arpa2G_syllables.sh index 8147a6bb38b..58ef162ec2e 100755 --- a/egs/babel/s5b/local/arpa2G_syllables.sh +++ b/egs/babel/s5b/local/arpa2G_syllables.sh @@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1; [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1; rho=$[$last_id+1] -# state 0 is start-state. state 1 is state after we saw silence. state 2 is +# state 0 is start-state. state 1 is state after we saw silence. state 2 is # "dead state/failure state" that is not coaccessible. cat < $destdir/rho.fst 0 1 $silence_id $silence_id @@ -35,16 +35,11 @@ EOF gunzip -c $lmfile | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ sed 's///g' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ + arpa2fst --disambig-symbol=#0 --ilabel-sort=false \ + --read-symbol-table=$langdir/words.txt - | \ fstrhocompose "$rho" - $destdir/rho.fst | \ - fstrmepsilon > $destdir/G.fst || exit 1 + fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 fstisstochastic $destdir/G.fst || true diff --git a/egs/babel/s5b/local/nnet2/get_egs_semi_supervised.sh b/egs/babel/s5b/local/nnet2/get_egs_semi_supervised.sh index fcf67514396..760d7ee80d5 100755 --- a/egs/babel/s5b/local/nnet2/get_egs_semi_supervised.sh +++ b/egs/babel/s5b/local/nnet2/get_egs_semi_supervised.sh @@ -28,7 +28,7 @@ transform_dir_sup= # If supplied, overrides alidir transform_dir_unsup= num_jobs_nnet=16 # Number of neural net jobs to run in parallel stage=-10 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. 
splice_width=4 # meaning +- 4 frames on each side for second LDA spk_vecs_dir_sup= spk_vecs_dir_unsup= diff --git a/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh b/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh index 79a1bbd2263..79bd348bf75 100755 --- a/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh +++ b/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh @@ -8,7 +8,7 @@ # This script trains LMs on the WSJ LM-training data. # It requires that you have already run wsj_extend_dict.sh, # to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily +# plus any OOVs and possible acronyms that we could easily # derive pronunciations for. # This script takes as command-line arguments the relevant data/lang @@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n $dir/train_in.gz || exit 1; # Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" +echo "Getting training data with OOV words replaced with (train_nounk.gz)" gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \ 'BEGIN{while((getline0) v[$1]=1;} {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ @@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline< # To save disk space, remove the un-mapped training data. We could # easily generate it again if needed. -rm $dir/train_nounk.gz +rm $dir/train_nounk.gz ################################################################## @@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram # The default LM chosen to be the last pruned 4gram-mincount # # Note: One can cheat and provide an external ARPA LM here!!! -# To do so, make sure that +# To do so, make sure that # -- its vocabulary is fully covered by $lang/words.txt, # -- it is gzipped and # -- it is placed in the $dir directory. @@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst" . ./path.sh || exit 1; gunzip -c $gzipped_ARPA_LM | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon > $lang/G.fst || exit 1; - fstisstochastic $lang/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1; +fstisstochastic $lang/G.fst ################################################################## # Redo the FST step after reviewing perplexities reported by the @@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \ ################################################################## exit 0 - diff --git a/egs/babel/s5c/cmd.sh b/egs/babel/s5c/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5c/cmd.sh +++ b/egs/babel/s5c/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5c/conf/lang/107-vietnamese-fullLP.official.conf b/egs/babel/s5c/conf/lang/107-vietnamese-fullLP.official.conf index 8b09764e45a..13bac7586a1 100644 --- a/egs/babel/s5c/conf/lang/107-vietnamese-fullLP.official.conf +++ b/egs/babel/s5c/conf/lang/107-vietnamese-fullLP.official.conf @@ -3,7 +3,7 @@ #speech corpora files location train_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/training/ -train_data_list=/export/babel/data/splits/Vietnamese_Babel107/train.fullLP.list +train_data_list=/export/babel/data/splits/Vietnamese_Babel107/train.FullLP.list train_nj=32 #RADICAL DEV data files diff --git a/egs/babel/s5c/local/arpa2G.sh b/egs/babel/s5c/local/arpa2G.sh index 83f789e999f..db816abc7a5 100755 --- a/egs/babel/s5c/local/arpa2G.sh +++ b/egs/babel/s5c/local/arpa2G.sh @@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then fi set -e #Exit on non-zero return code from any command -set -o pipefail #Exit if any of the commands in the pipeline will +set -o pipefail #Exit if any of the commands in the pipeline will #return non-zero return code lmfile=$1 @@ -58,7 +58,7 @@ if [ ! -z "$oov_prob_file" ]; then exit 1; fi - min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0; + min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0; while() { if (m/\\(\d)-grams:/) { $order = $1; } if ($order == 1) { @A = split; if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob') @@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then while() { if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; } else { print; } # print all lines unchanged except the one that says ngram 1=X. 
- if (m/^\\1-grams:$/) { + if (m/^\\1-grams:$/) { foreach $l (@OOVS) { @A = split(" ", $l); @A == 2 || die "bad line in oov2prob: $_;"; @@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then else decompress="cat $lmfile" fi - + $decompress | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1 + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1 + fstisstochastic $destdir/G.fst || true; if $cleanup; then diff --git a/egs/babel/s5c/local/arpa2G_syllables.sh b/egs/babel/s5c/local/arpa2G_syllables.sh index 8f10f87f019..58ef162ec2e 100755 --- a/egs/babel/s5c/local/arpa2G_syllables.sh +++ b/egs/babel/s5c/local/arpa2G_syllables.sh @@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1; [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1; rho=$[$last_id+1] -# state 0 is start-state. state 1 is state after we saw silence. state 2 is +# state 0 is start-state. state 1 is state after we saw silence. state 2 is # "dead state/failure state" that is not coaccessible. cat < $destdir/rho.fst 0 1 $silence_id $silence_id @@ -35,16 +35,11 @@ EOF gunzip -c $lmfile | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ sed 's///g' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ + arpa2fst --disambig-symbol=#0 --ilabel-sort=false \ + --read-symbol-table=$langdir/words.txt - | \ fstrhocompose "$rho" - $destdir/rho.fst | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 + fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 fstisstochastic $destdir/G.fst || true diff --git a/egs/babel/s5c/local/datasets/extra_kws.sh b/egs/babel/s5c/local/datasets/extra_kws.sh index a84ebc7deb1..cb90968a1dc 100644 --- a/egs/babel/s5c/local/datasets/extra_kws.sh +++ b/egs/babel/s5c/local/datasets/extra_kws.sh @@ -60,7 +60,7 @@ function setup_oov_search { #instead of search collection dependent if [ ! -f exp/conf_matrix/.done ] ; then local/generate_confusion_matrix.sh --cmd "$decode_cmd" --nj $my_nj \ - exp/sgmm5_denlats/dengraph exp/sgmm5 exp/sgmm5_ali exp/sgmm5_denlats exp/conf_matrix + exp/sgmm5_denlats/dengraph exp/sgmm5 exp/sgmm5_ali exp/sgmm5_denlats exp/conf_matrix || return 1 touch exp/conf_matrix/.done fi confusion=exp/conf_matrix/confusions.txt diff --git a/egs/babel/s5c/local/extend_lexicon.sh b/egs/babel/s5c/local/extend_lexicon.sh index 18c69415ed4..fd0b27a4172 100755 --- a/egs/babel/s5c/local/extend_lexicon.sh +++ b/egs/babel/s5c/local/extend_lexicon.sh @@ -2,6 +2,7 @@ # Copyright 2014 Johns Hopkins University (authors: Daniel Povey, Yenda Trmal) # 2014 Guoguo Chen +# 2015 MIT Lincoln Labs (author: Fred Richardson) # Apache 2.0. # This script takes an input lexicon (e.g. 
lexicon.txt) and generates likely @@ -351,7 +352,17 @@ if [ $stage -le $g2p_iters ]; then g2p.py -V $var_mass --variants-number $var_counts --encoding $encoding \ --model $dir/p2g.model.final --apply - \ \> $dir/p2g_output.JOB || exit 1; - cat $dir/p2g_output.* > $dir/p2g_output + perl -wlne 'use strict; + our %P; + my ($prn,$num,$prb,$spl)=m/^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/; + my $tok=$prn."=".$spl; + $P{$tok} = [ $num, $prb ] unless (defined($P{$tok}) && $P{$tok}[1] < $prb); + END { + map{ my ($prn,$spl)=m/^(.*)=(.*)$/; + my ($num, $prb) = @{$P{$tok}}; + print join("\t",$prn,$num,$prb,$spl) + } sort keys %P + }' $dir/p2g_output.* > $dir/p2g_output rm $dir/p2g_output.* fi diff --git a/egs/babel/s5c/local/generate_confusion_matrix.sh b/egs/babel/s5c/local/generate_confusion_matrix.sh index 6529057db9e..4bcbacb5ae9 100755 --- a/egs/babel/s5c/local/generate_confusion_matrix.sh +++ b/egs/babel/s5c/local/generate_confusion_matrix.sh @@ -37,6 +37,7 @@ fi set -u set -e +set -o pipefail data=$1; shift modeldir=$1; shift @@ -64,7 +65,7 @@ cat $data/phones.txt | sed 's/_[B|E|I|S]//g' |\ echo "Converting alignments to phone sequences..." $cmd JOB=1:$nj $wdir/log/ali_to_phones.JOB.log \ - compute-wer --text --mode=all\ + align-text\ ark:\<\( \ ali-to-phones $model ark:"gunzip -c $alidir/ali.JOB.gz|" ark,t:- \|\ int2sym.pl -f 2- $wdir/phones.txt - \) \ @@ -72,7 +73,7 @@ $cmd JOB=1:$nj $wdir/log/ali_to_phones.JOB.log \ lattice-to-phone-lattice $model ark:"gunzip -c $latdir/lat.JOB.gz|" ark:- \| \ lattice-best-path --acoustic-scale=$acwt ark:- ark,t:- ark:/dev/null \| \ int2sym.pl -f 2- $wdir/phones.txt - \) \ - $wdir/confusions.JOB.txt + ark:$wdir/confusions.JOB.txt confusion_files="" for i in `seq 1 $nj` ; do @@ -80,23 +81,12 @@ for i in `seq 1 $nj` ; do done echo "Converting statistics..." -cat $confusion_files | sort | uniq -c | grep -v -E '|||SIL' | \ +cat $confusion_files | cut -f 2- -d ' ' | sed 's/ *; */\n/g'| sort | uniq -c | \ + grep -v -E '|||SIL' | \ perl -ane ' - if ($F[1] eq "correct") { - die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2); - print "$F[2] $F[2] $F[0]\n"; - } elsif ($F[1] eq "deletion" ) { - die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2); - print "$F[2] $F[0]\n"; - } elsif ($F[1] eq "insertion") { - die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2); - print " $F[2] $F[0]\n"; - } elsif ($F[1] eq "substitution") { - die "Unknown format " . join(" ", @F) . "\n" if ($#F != 3); - print "$F[2] $F[3] $F[0]\n"; - } else { - die "Unknown line " . join(" ", @F). "\n"; - }' > $wdir/confusions.txt + die unless scalar @F == 3; + print "$F[1] $F[2] $F[0]\n"; + ' > $wdir/confusions.txt exit 0 #-echo "Converting alignments to phone sequences..." diff --git a/egs/babel/s5c/local/nnet2/get_egs_semi_supervised.sh b/egs/babel/s5c/local/nnet2/get_egs_semi_supervised.sh index fcf67514396..760d7ee80d5 100755 --- a/egs/babel/s5c/local/nnet2/get_egs_semi_supervised.sh +++ b/egs/babel/s5c/local/nnet2/get_egs_semi_supervised.sh @@ -28,7 +28,7 @@ transform_dir_sup= # If supplied, overrides alidir transform_dir_unsup= num_jobs_nnet=16 # Number of neural net jobs to run in parallel stage=-10 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. 
splice_width=4 # meaning +- 4 frames on each side for second LDA spk_vecs_dir_sup= spk_vecs_dir_unsup= diff --git a/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh b/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh index d69bf3338f6..79bd348bf75 100755 --- a/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh +++ b/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh @@ -8,7 +8,7 @@ # This script trains LMs on the WSJ LM-training data. # It requires that you have already run wsj_extend_dict.sh, # to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily +# plus any OOVs and possible acronyms that we could easily # derive pronunciations for. # This script takes as command-line arguments the relevant data/lang @@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n $dir/train_in.gz || exit 1; # Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" +echo "Getting training data with OOV words replaced with (train_nounk.gz)" gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \ 'BEGIN{while((getline0) v[$1]=1;} {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ @@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline< # To save disk space, remove the un-mapped training data. We could # easily generate it again if needed. -rm $dir/train_nounk.gz +rm $dir/train_nounk.gz ################################################################## @@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram # The default LM chosen to be the last pruned 4gram-mincount # # Note: One can cheat and provide an external ARPA LM here!!! -# To do so, make sure that +# To do so, make sure that # -- its vocabulary is fully covered by $lang/words.txt, # -- it is gzipped and # -- it is placed in the $dir directory. @@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst" . ./path.sh || exit 1; gunzip -c $gzipped_ARPA_LM | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1; - fstisstochastic $lang/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1; +fstisstochastic $lang/G.fst ################################################################## # Redo the FST step after reviewing perplexities reported by the @@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \ ################################################################## exit 0 - diff --git a/egs/babel/s5c/run-4-anydecode.sh b/egs/babel/s5c/run-4-anydecode.sh index a1b943dd35e..68b87ea1e27 100755 --- a/egs/babel/s5c/run-4-anydecode.sh +++ b/egs/babel/s5c/run-4-anydecode.sh @@ -188,11 +188,11 @@ echo --------------------------------------------------------------------- if [ ! -f $dataset_dir/.done ] ; then if [ "$dataset_kind" == "supervised" ]; then if [ "$dataset_segments" == "seg" ]; then - . ./local/datasets/supervised_seg.sh + . ./local/datasets/supervised_seg.sh || exit 1 elif [ "$dataset_segments" == "uem" ]; then - . ./local/datasets/supervised_uem.sh + . ./local/datasets/supervised_uem.sh || exit 1 elif [ "$dataset_segments" == "pem" ]; then - . 
./local/datasets/supervised_pem.sh + . ./local/datasets/supervised_pem.sh || exit 1 else echo "Unknown type of the dataset: \"$dataset_segments\"!"; echo "Valid dataset types are: seg, uem, pem"; @@ -241,12 +241,12 @@ echo --------------------------------------------------------------------- echo "Preparing kws data files in ${dataset_dir} on" `date` echo --------------------------------------------------------------------- if ! $skip_kws ; then - . ./local/datasets/basic_kws.sh + . ./local/datasets/basic_kws.sh || exit 1 if $extra_kws ; then - . ./local/datasets/extra_kws.sh + . ./local/datasets/extra_kws.sh || exit 1 fi if $vocab_kws ; then - . ./local/datasets/vocab_kws.sh + . ./local/datasets/vocab_kws.sh || exit 1 fi fi diff --git a/egs/bn_music_speech/v1/README b/egs/bn_music_speech/v1/README new file mode 100644 index 00000000000..8a8ae65108d --- /dev/null +++ b/egs/bn_music_speech/v1/README @@ -0,0 +1,6 @@ + The MUSAN corpus is required for system training. It is available at: + http://www.openslr.org/17/ + + The test requires Broadcast News data. The LDC Catalog numbers are: + Speech LDC97S44 + Transcripts LDC97T22 diff --git a/egs/bn_music_speech/v1/cmd.sh b/egs/bn_music_speech/v1/cmd.sh new file mode 100755 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/bn_music_speech/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/bn_music_speech/v1/conf/merge_vad_map.txt b/egs/bn_music_speech/v1/conf/merge_vad_map.txt new file mode 100644 index 00000000000..216dee78b65 --- /dev/null +++ b/egs/bn_music_speech/v1/conf/merge_vad_map.txt @@ -0,0 +1,16 @@ +# This table defines the mapping used by the binary merge-vads to +# combine the output of compute-vad and compute-vad-from-frame-likes. +# The first column corresponds to VAD decisions from compute-vad +# and the second corresponds to VAD decisions from +# compute-vad-from-frame-likes. The labels "0" and "1" in the +# first column represent (approximately) silence and nonsilence +# respectively. The labels "0," "1," and "2" in the second column +# represent noise, speech, and music, respectively. The third +# column lists the resulting output labels: "0," "1," and "2" +# corresponding to silence/noise, speech, and music. +0 0 0 +1 0 0 +0 1 0 +1 1 1 +0 2 0 +1 2 2 diff --git a/egs/bn_music_speech/v1/conf/mfcc.conf b/egs/bn_music_speech/v1/conf/mfcc.conf new file mode 100644 index 00000000000..a4be40be454 --- /dev/null +++ b/egs/bn_music_speech/v1/conf/mfcc.conf @@ -0,0 +1,6 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). +--num-ceps=20 # higher than the default which is 12. 
+--snip-edges=false diff --git a/egs/bn_music_speech/v1/conf/vad.conf b/egs/bn_music_speech/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/bn_music_speech/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/bn_music_speech/v1/local/make_annotations_bn.py b/egs/bn_music_speech/v1/local/make_annotations_bn.py new file mode 100755 index 00000000000..53cebf52ea4 --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_annotations_bn.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script creates four files for each HUB4 Broadcast News +# transcript file. The four files are for the music, speech, ad, +# and other transcripts. Each line of the output files define the +# start and end times of the individual events. +# +# This file is meant to be invoked by make_bn.sh. + +import sys, re, os + +def is_speech(line): + if " end: + print "Skipping annotation where end time is before start time:", line + return start, end + +def extract_other_type2(line): + m = re.search('(?<=S_time=)\d+.\d+', line) + start = float(m.group(0)) + m = re.search('(?<=E_time=)\d+.\d+', line) + end = float(m.group(0)) + if start > end: + print "Skipping annotation where end time is before start time:", line + return start, end + +def extract_music(line): + m = re.search('(?<=Time=)\d+.\d+', line) + time = float(m.group(0)) + m = re.search('(?<=Level=)\w', line) + level = m.group(0) + is_on = False + if level == "L" or level == "H": + is_on = True + elif level == "O": + is_on = False + else: + print "Encountered bad token on line:", line + sys.exit() + return time, is_on + +def extract_other_type1(line): + m = re.search('(?<=Time=)\d+.\d+', line) + time = float(m.group(0)) + m = re.search('(?<=Level=)\w', line) + level = m.group(0) + is_on = False + if level == "L" or level == "H": + is_on = True + elif level == "O": + is_on = False + else: + print "Encountered bad token on line:", line + sys.exit() + return time, is_on + +def process_file(annos): + speech = "" + music = "" + other_type2 = "" + other_type1 = "" + start_new_music_segment = True + start_new_other_segment = True + max_time = 0.0 + prev_music_time = "0.0" + prev_other_time = "0.0" + for line in annos: + if is_speech(line): + speech_start, speech_end = extract_speech(line) + speech = speech + str(speech_start) + " " + str(speech_end) + "\n" + max_time = max(speech_end, max_time) + elif is_other_type2(line): + other_type2_start, other_type2_end = extract_other_type2(line) + other_type2 = other_type2 + str(other_type2_start) + " " + str(other_type2_end) + "\n" + max_time = max(other_type2_end, max_time) + elif is_music(line): + time, is_on = extract_music(line) + max_time = max(time, max_time) + if is_on and start_new_music_segment: + prev_music_time = time + start_new_music_segment = False + elif not is_on and not start_new_music_segment: + music = music + str(prev_music_time) + " " + str(time) + "\n" + start_new_music_segment = True + elif is_other_type1(line): + time, is_on = extract_other_type1(line) + max_time = max(time, max_time) + if is_on and start_new_other_segment: + prev_other_time = time + start_new_other_segment = False + elif not is_on and not start_new_other_segment: + other_type1 = other_type1 + str(prev_other_time) + " " + str(time) + "\n" + start_new_other_segment = True + + if not start_new_music_segment: + music = music + str(prev_music_time) + " " + str(max_time) + "\n" + if not 
start_new_other_segment: + other_type1 = other_type1 + str(prev_other_time) + " " + str(max_time) + "\n" + + other = other_type1 + other_type2 + return speech, music, other + +def main(): + in_dir = sys.argv[1] + out_dir = sys.argv[2] + utts = "" + for root, dirs, files in os.walk(in_dir): + for file in files: + if file.endswith(".txt"): + anno_in = open(os.path.join(root, file), 'r').readlines() + speech, music, other = process_file(anno_in) + utt = file.replace(".txt", "") + utts = utts + utt + "\n" + speech_fi_str = utt + "_speech.key" + music_fi_str = utt + "_music.key" + other_fi_str = utt + "_other.key" + speech_fi = open(os.path.join(out_dir, speech_fi_str), 'w') + speech_fi.write(speech) + music_fi = open(os.path.join(out_dir, music_fi_str), 'w') + music_fi.write(music) + other_fi = open(os.path.join(out_dir, other_fi_str), 'w') + other_fi.write(other) + utts_fi = open(os.path.join(out_dir, "utt_list"), 'w') + utts_fi.write(utts) + +if __name__=="__main__": + main() + diff --git a/egs/bn_music_speech/v1/local/make_bn.py b/egs/bn_music_speech/v1/local/make_bn.py new file mode 100755 index 00000000000..98836d32534 --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_bn.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. +# +# Using the annotations created by refine_annotations_bn.py, this script +# creates the segments, utt2spk, and wav.scp files. +# +# This file is meant to be invoked by make_bn.sh. + +import os, sys +wav_dir = sys.argv[1] +out_dir = sys.argv[2] + +utts = open(os.path.join(out_dir, "utt_list"), 'r').readlines() +utts = set(x.rstrip() for x in utts) +wav = "" +segments = "" +utt2spk = "" +for subdir, dirs, files in os.walk(wav_dir): + for file in files: + utt = str(file).replace(".sph", "") + if file.endswith(".sph") and utt in utts: + wav = wav + utt + " sox " + subdir + "/" + utt + ".sph" + " -c 1 -r 16000 -t wav - |\n" +wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') +wav_fi.write(wav) + +for utt in utts: + music_filename = utt + "_music.key.refined" + speech_filename = utt + "_speech.key.refined" + music_fi = open(os.path.join(out_dir, music_filename), 'r').readlines() + speech_fi = open(os.path.join(out_dir, speech_filename), 'r').readlines() + count = 1 + for line in music_fi: + left, right = line.rstrip().split(" ") + segments = segments + utt + "-music-" + str(count) + " " + utt + " " + left + " " + right + "\n" + utt2spk = utt2spk + utt + "-music-" + str(count) + " " + utt + "-music-" + str(count) + "\n" + count += 1 + count = 1 + for line in speech_fi: + left, right = line.rstrip().split(" ") + segments = segments + utt + "-speech-" + str(count) + " " + utt + " " + left + " " + right + "\n" + utt2spk = utt2spk + utt + "-speech-" + str(count) + " " + utt + "-speech-" + str(count) + "\n" + count += 1 +utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') +utt2spk_fi.write(utt2spk) +segments_fi = open(os.path.join(out_dir, "segments"), 'w') +segments_fi.write(segments) + diff --git a/egs/bn_music_speech/v1/local/make_bn.sh b/egs/bn_music_speech/v1/local/make_bn.sh new file mode 100755 index 00000000000..5e2a29f0cca --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_bn.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script, called by ../run.sh, creates the HUB4 Broadcast News +# data directory. 
The required datasets can be found at: +# https://catalog.ldc.upenn.edu/LDC97S44 +# https://catalog.ldc.upenn.edu/LDC97T22 + +set -e +sph_dir=$1 +transcript_dir=$2 +data_dir=$3 +tmp_dir=local/bn.tmp + +# These parameters are used when refining the annotations. +# A higher frames_per_second provides better resolution at the +# frame boundaries. Set min_seg to control the minimum length of the +# final segments. It seems that the original annotations for segments +# below half a second are not very accurate, so we test only on segments +# longer than this. +frames_per_sec=100 +min_seg=0.5 + +rm -rf local/bn.tmp +mkdir local/bn.tmp + +echo "$0: preparing annotations..." +local/make_annotations_bn.py ${transcript_dir} ${tmp_dir} +echo "$0: Removing overlapping annotations..." +local/refine_annotations_bn.py ${tmp_dir} ${frames_per_sec} ${min_seg} +echo "$0: Preparing broadcast news data directories ${data_dir}/bn..." +local/make_bn.py ${sph_dir} ${tmp_dir} + +mkdir -p ${data_dir}/bn +cp ${tmp_dir}/wav.scp ${data_dir}/bn/ +cp ${tmp_dir}/utt2spk ${data_dir}/bn/ +cp ${tmp_dir}/segments ${data_dir}/bn/ +rm -rf local/bn.tmp +utils/fix_data_dir.sh data/bn diff --git a/egs/bn_music_speech/v1/local/make_musan.py b/egs/bn_music_speech/v1/local/make_musan.py new file mode 100755 index 00000000000..490de9baa37 --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_musan.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, 'r').readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + +def prepare_music(root_dir, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + music_dir = os.path.join(root_dir, "music") + print str(music_dir) + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + return utt2spk_str, utt2wav_str + +def prepare_speech(root_dir): + utt2spk = {} + utt2wav = {} + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + return utt2spk_str, utt2wav_str + +def prepare_noise(root_dir): + utt2spk = {} + utt2wav = {} + speech_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt 
+ utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + return utt2spk_str, utt2wav_str + +def main(): + in_dir = sys.argv[1] + out_dir = sys.argv[2] + use_vocals = sys.argv[3] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') + utt2spk_fi.write(utt2spk) + + +if __name__=="__main__": + main() diff --git a/egs/bn_music_speech/v1/local/make_musan.sh b/egs/bn_music_speech/v1/local/make_musan.sh new file mode 100755 index 00000000000..1faac0ef58c --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_musan.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +in_dir=$1 +data_dir=$2 +use_vocals='Y' + +rm -rf local/musan.tmp +mkdir local/musan.tmp + +echo "Preparing ${data_dir}/musan..." +mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf local/musan.tmp + diff --git a/egs/bn_music_speech/v1/local/print_scores.py b/egs/bn_music_speech/v1/local/print_scores.py new file mode 100755 index 00000000000..c2b587cdcad --- /dev/null +++ b/egs/bn_music_speech/v1/local/print_scores.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script prints out lines of the form: +# . +# Its output is meant to be used as input to the binary +# compute-eer. The Broadcast News utterances have either +# "music" or "speech" in the utterance name, and so we +# can simply check if the utterance name contains one of +# those strings to determine if it is a target or nontarget +# utterance. We arbitrarily pick music to be the target class. + +import sys +utt2score = open(sys.argv[1], 'r').readlines() +for i in range(0, len(utt2score)): + utt, score = utt2score[i].rstrip().split() + if "music" in utt: + type = "target" + else: + type = "nontarget" + print score, type diff --git a/egs/bn_music_speech/v1/local/refine_annotations_bn.py b/egs/bn_music_speech/v1/local/refine_annotations_bn.py new file mode 100755 index 00000000000..52ac87c8640 --- /dev/null +++ b/egs/bn_music_speech/v1/local/refine_annotations_bn.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. 
+# +# This script refines the annotation files produced by +# make_annotations_bn.py. In order to create unambiguous annotations, +# we remove any part of a segment that overlaps with another. Also, +# this script merges together contiguous segments that have the +# same annotation, and ensures that only segments longer than a +# designated length are created. +# +# This file is meant to be invoked from make_bn.sh. +import sys, os + +def seg_to_string(seg): + start = seg[0] + end = seg[1] + if start < end: + return str(start) + " " + str(end) + "\n" + else: + return "" + +def process_segs(raw_segs): + segs = [] + for seg in raw_segs: + lower, upper = map(float, seg.rstrip().split(" ")) + segs.append((lower, upper)) + return segs + +def resegment(music, speech, other, frame_length, min_seg): + frame2classes = [] + max_duration = 0 + all_segs = music + speech + other + for (start, end) in all_segs: + if end > max_duration: + max_duration = end + num_frames = int(max_duration) * frame_length + for i in range(0, num_frames): + frame2classes.append([]) + + annotate_frames(frame2classes, music, "music", frame_length, num_frames) + annotate_frames(frame2classes, speech, "speech", frame_length, num_frames) + annotate_frames(frame2classes, other, "other", frame_length, num_frames) + + curr_class = None + for i in range(0, len(frame2classes)): + if len(frame2classes[i]) != 1 or frame2classes[i][0] == "other": + curr_class = "other" + elif frame2classes[i][0] == "music": + curr_class = "music" + elif frame2classes[i][0] == "speech": + curr_class = "speech" + else: + curr_class = "other" + frame2classes[i] = curr_class + + new_music = [] + new_speech = [] + curr_class = frame2classes[0] + start_frame = 0 + for i in range(1, len(frame2classes)): + if curr_class != frame2classes[i]: + start = float(start_frame) / frame_length + end = float(i) / frame_length + if end - start > min_seg: + if curr_class == "music": + new_music.append((start, end)) + elif curr_class == "speech": + new_speech.append((start, end)) + start_frame = i + curr_class = frame2classes[i] + + return new_music, new_speech + + +def annotate_frames(frame2classes, segs, annotation, frame_length, max_duration): + for (start, end) in segs: + frame_start = min(int(start * frame_length), max_duration) + frame_end = min(int(end * frame_length), max_duration) + for i in range(frame_start, frame_end): + frame2classes[i].append(annotation) + +def main(): + out_dir = sys.argv[1] + frames_per_sec = int(sys.argv[2]) + min_seg_length = float(sys.argv[3]) + + utts = open(os.path.join(out_dir, "utt_list"), 'r').readlines() + for line in utts: + speech_filename = os.path.join(out_dir, line.rstrip() + "_speech.key") + music_filename = os.path.join(out_dir, line.rstrip() + "_music.key") + other_filename = os.path.join(out_dir, line.rstrip() + "_other.key") + raw_speech_segs = open(speech_filename, 'r').readlines() + raw_music_segs = open(music_filename, 'r').readlines() + raw_other_segs = open(other_filename, 'r').readlines() + speech_segs = process_segs(raw_speech_segs) + music_segs = process_segs(raw_music_segs) + other_segs = process_segs(raw_other_segs) + music_segs, speech_segs = resegment(music_segs, speech_segs, other_segs, frames_per_sec, min_seg_length) + + speech_output = "" + music_output = "" + for seg in music_segs: + music_output = music_output + seg_to_string(seg) + for seg in speech_segs: + speech_output = speech_output + seg_to_string(seg) + + speech_fi = open(speech_filename + ".refined", 'w') + music_fi = open(music_filename + 
".refined", 'w') + speech_fi.write(speech_output) + music_fi.write(music_output) + speech_fi.close() + music_fi.close() + +if __name__=="__main__": + main() diff --git a/egs/bn_music_speech/v1/path.sh b/egs/bn_music_speech/v1/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/bn_music_speech/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/bn_music_speech/v1/run.sh b/egs/bn_music_speech/v1/run.sh new file mode 100755 index 00000000000..67935ead983 --- /dev/null +++ b/egs/bn_music_speech/v1/run.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This example demonstrates music/speech discrimination. This recipe trains +# three GMMs on the music, speech and noise portions of the MUSAN corpus. +# We test the systems on Broadcast News. The Broadcast News test data consists +# of short segments of either speech or music. The classification decisions +# are made at a segment level from the average likelihoods of two GMMs. +# Results (EERs) are inline in comments below. +# +# See README.txt for more info on data required. + +. cmd.sh +. path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +local/make_bn.sh /export/corpora5/LDC/LDC97S44 \ + /export/corpora/LDC/LDC97T22 data + +local/make_musan.sh /export/corpora/JHU/musan data + +steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ + data/musan_speech exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ + data/musan_music exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 5 --cmd "$train_cmd" \ + data/musan_noise exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ + data/bn exp/make_mfcc $mfccdir + +utils/fix_data_dir.sh data/musan_speech +utils/fix_data_dir.sh data/musan_music +utils/fix_data_dir.sh data/musan_noise +utils/fix_data_dir.sh data/bn + +sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/musan_speech exp/make_vad $vaddir +sid/compute_vad_decision.sh --nj 5 --cmd "$train_cmd" \ + data/musan_noise exp/make_vad $vaddir +sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/musan_music exp/make_vad $vaddir +sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/bn exp/make_vad $vaddir + +sid/train_diag_ubm.sh --nj 10 --cmd "$train_cmd" --delta-window 2 \ + data/musan_noise 32 exp/diag_ubm_noise & +sid/train_diag_ubm.sh --nj 20 --cmd "$train_cmd" --delta-window 2 \ + data/musan_speech 32 exp/diag_ubm_speech & +sid/train_diag_ubm.sh --nj 20 --cmd "$train_cmd" --delta-window 2 \ + data/musan_music 32 exp/diag_ubm_music +wait; + +sid/train_full_ubm.sh --nj 20 --cmd "$train_cmd" \ + --remove-low-count-gaussians false data/musan_noise \ + exp/diag_ubm_noise exp/full_ubm_noise & +sid/train_full_ubm.sh --nj 20 --cmd "$train_cmd" \ + --remove-low-count-gaussians false data/musan_speech \ + exp/diag_ubm_speech exp/full_ubm_speech & +sid/train_full_ubm.sh --nj 20 --cmd "$train_cmd" \ + --remove-low-count-gaussians false data/musan_music \ + exp/diag_ubm_music exp/full_ubm_music +wait; + +sid/music_id.sh --cmd "$train_cmd" --nj 40 \ + exp/full_ubm_music 
exp/full_ubm_speech \ + data/bn exp/bn_music_speech +sid/music_id.sh --cmd "$train_cmd" --nj 40 \ + exp/full_ubm_noise exp/full_ubm_speech \ + data/bn exp/bn_noise_speech + +printf "EER using GMMs trained on music and speech" +compute-eer <(local/print_scores.py exp/bn_music_speech/ratio) +# Equal error rate is 0.344234%, at threshold 0.525752 +printf "\nEER using GMM trained on noise instead of music" +compute-eer <(local/print_scores.py exp/bn_noise_speech/ratio) +# Equal error rate is 0.860585%, at threshold 0.123218 + +# The following script replaces the VAD decisions originally computed by +# the energy-based VAD. It uses the GMMs trained earlier in the script +# to make frame-level decisions. Due to the mapping provided in +# conf/merge_vad_map.txt, "0" corresponds to silence, "1" to speech, and +# "2" to music. +sid/compute_vad_decision_gmm.sh --nj 40 --cmd "$train_cmd" \ + --merge-map-config conf/merge_vad_map.txt --use-energy-vad true \ + data/bn exp/full_ubm_noise exp/full_ubm_speech/ \ + exp/full_ubm_music/ exp/vad_gmm exp/vad_gmm/ diff --git a/egs/bn_music_speech/v1/sid b/egs/bn_music_speech/v1/sid new file mode 120000 index 00000000000..a9cdb0f0013 --- /dev/null +++ b/egs/bn_music_speech/v1/sid @@ -0,0 +1 @@ +../../sre10/v1/sid \ No newline at end of file diff --git a/egs/bn_music_speech/v1/steps b/egs/bn_music_speech/v1/steps new file mode 120000 index 00000000000..83b3d2b59a3 --- /dev/null +++ b/egs/bn_music_speech/v1/steps @@ -0,0 +1 @@ +../../sre10/v1/steps \ No newline at end of file diff --git a/egs/bn_music_speech/v1/utils b/egs/bn_music_speech/v1/utils new file mode 120000 index 00000000000..726839e0092 --- /dev/null +++ b/egs/bn_music_speech/v1/utils @@ -0,0 +1 @@ +../../sre10/v1/utils \ No newline at end of file diff --git a/egs/callhome_egyptian/s5/RESULTS b/egs/callhome_egyptian/s5/RESULTS new file mode 100644 index 00000000000..1d1c8fd1690 --- /dev/null +++ b/egs/callhome_egyptian/s5/RESULTS @@ -0,0 +1,226 @@ +-------------------------------------------------------------------------------------- +Triphone with mono alignment (small) +-------------------------------------------------------------------------------------- +exp/tri1/decode_dev/wer_11 %WER 67.90 [ 22753 / 33509, 1778 ins, 5369 del, 15606 sub ] +exp/tri1/decode_dev/wer_12 %WER 67.91 [ 22757 / 33509, 1555 ins, 5782 del, 15420 sub ] +exp/tri1/decode_dev/wer_10 %WER 68.14 [ 22834 / 33509, 2041 ins, 4902 del, 15891 sub ] +exp/tri1/decode_dev/wer_13 %WER 68.19 [ 22851 / 33509, 1428 ins, 6227 del, 15196 sub ] +exp/tri1/decode_dev/wer_9 %WER 68.68 [ 23015 / 33509, 2379 ins, 4422 del, 16214 sub ] +exp/tri1/decode_dev/wer_8 %WER 69.53 [ 23298 / 33509, 2748 ins, 4024 del, 16526 sub ] +exp/tri1/decode_dev/wer_7 %WER 70.92 [ 23766 / 33509, 3180 ins, 3609 del, 16977 sub ] +exp/tri1/decode_dev/wer_6 %WER 72.71 [ 24366 / 33509, 3674 ins, 3218 del, 17474 sub ] +exp/tri1/decode_dev/wer_5 %WER 75.02 [ 25137 / 33509, 4247 ins, 2886 del, 18004 sub ] +exp/tri1/decode_dev/wer_4 %WER 77.08 [ 25830 / 33509, 4794 ins, 2625 del, 18411 sub ] +exp/tri1/decode_dev/wer_3 %WER 79.37 [ 26595 / 33509, 5340 ins, 2424 del, 18831 sub ] +exp/tri1/decode_dev/wer_2 %WER 81.52 [ 27317 / 33509, 5869 ins, 2268 del, 19180 sub ] + +-------------------------------------------------------------------------------------- +Triphone with tri alignments +-------------------------------------------------------------------------------------- +exp/tri2/decode_dev/wer_11 %WER 66.41 [ 22253 / 33509, 1841 ins, 5001 del, 15411 sub ] +exp/tri2/decode_dev/wer_12 
%WER 66.44 [ 22262 / 33509, 1620 ins, 5463 del, 15179 sub ] +exp/tri2/decode_dev/wer_13 %WER 66.61 [ 22322 / 33509, 1448 ins, 5926 del, 14948 sub ] +exp/tri2/decode_dev/wer_10 %WER 66.73 [ 22360 / 33509, 2153 ins, 4575 del, 15632 sub ] +exp/tri2/decode_dev/wer_9 %WER 67.36 [ 22573 / 33509, 2453 ins, 4102 del, 16018 sub ] +exp/tri2/decode_dev/wer_8 %WER 68.65 [ 23003 / 33509, 2874 ins, 3741 del, 16388 sub ] +exp/tri2/decode_dev/wer_7 %WER 70.19 [ 23521 / 33509, 3380 ins, 3363 del, 16778 sub ] +exp/tri2/decode_dev/wer_6 %WER 72.17 [ 24183 / 33509, 3950 ins, 3003 del, 17230 sub ] +exp/tri2/decode_dev/wer_5 %WER 74.31 [ 24901 / 33509, 4476 ins, 2715 del, 17710 sub ] +exp/tri2/decode_dev/wer_4 %WER 76.48 [ 25627 / 33509, 5044 ins, 2460 del, 18123 sub ] +exp/tri2/decode_dev/wer_3 %WER 78.52 [ 26312 / 33509, 5544 ins, 2251 del, 18517 sub ] +exp/tri2/decode_dev/wer_2 %WER 80.92 [ 27115 / 33509, 6114 ins, 2105 del, 18896 sub ] + +-------------------------------------------------------------------------------------- +Triphone + LDA + MLLT +-------------------------------------------------------------------------------------- +exp/tri3a/decode_dev/wer_11 %WER 62.31 [ 20878 / 33509, 1793 ins, 4872 del, 14213 sub ] +exp/tri3a/decode_dev/wer_12 %WER 62.33 [ 20887 / 33509, 1581 ins, 5349 del, 13957 sub ] +exp/tri3a/decode_dev/wer_10 %WER 62.51 [ 20947 / 33509, 2058 ins, 4415 del, 14474 sub ] +exp/tri3a/decode_dev/wer_13 %WER 62.68 [ 21005 / 33509, 1388 ins, 5856 del, 13761 sub ] +exp/tri3a/decode_dev/wer_9 %WER 63.20 [ 21177 / 33509, 2369 ins, 3972 del, 14836 sub ] +exp/tri3a/decode_dev/wer_8 %WER 64.29 [ 21543 / 33509, 2771 ins, 3604 del, 15168 sub ] +exp/tri3a/decode_dev/wer_7 %WER 65.63 [ 21993 / 33509, 3209 ins, 3288 del, 15496 sub ] +exp/tri3a/decode_dev/wer_6 %WER 67.63 [ 22661 / 33509, 3723 ins, 2970 del, 15968 sub ] +exp/tri3a/decode_dev/wer_5 %WER 69.68 [ 23350 / 33509, 4241 ins, 2686 del, 16423 sub ] +exp/tri3a/decode_dev/wer_4 %WER 71.83 [ 24069 / 33509, 4774 ins, 2439 del, 16856 sub ] +exp/tri3a/decode_dev/wer_3 %WER 74.14 [ 24842 / 33509, 5326 ins, 2278 del, 17238 sub ] +exp/tri3a/decode_dev/wer_2 %WER 76.28 [ 25561 / 33509, 5814 ins, 2152 del, 17595 sub ] + +-------------------------------------------------------------------------------------- ++ SAT + fMLLR +-------------------------------------------------------------------------------------- +exp/tri4a/decode_dev/wer_12 %WER 58.22 [ 19510 / 33509, 1796 ins, 4447 del, 13267 sub ] +exp/tri4a/decode_dev/wer_11 %WER 58.29 [ 19532 / 33509, 1998 ins, 4124 del, 13410 sub ] +exp/tri4a/decode_dev/wer_13 %WER 58.47 [ 19593 / 33509, 1634 ins, 4808 del, 13151 sub ] +exp/tri4a/decode_dev/wer_10 %WER 58.61 [ 19641 / 33509, 2283 ins, 3790 del, 13568 sub ] +exp/tri4a/decode_dev/wer_9 %WER 59.29 [ 19867 / 33509, 2591 ins, 3455 del, 13821 sub ] +exp/tri4a/decode_dev/wer_8 %WER 60.60 [ 20307 / 33509, 2969 ins, 3133 del, 14205 sub ] +exp/tri4a/decode_dev/wer_7 %WER 62.11 [ 20812 / 33509, 3471 ins, 2790 del, 14551 sub ] +exp/tri4a/decode_dev/wer_6 %WER 64.08 [ 21471 / 33509, 3976 ins, 2508 del, 14987 sub ] +exp/tri4a/decode_dev/wer_5 %WER 66.25 [ 22200 / 33509, 4563 ins, 2283 del, 15354 sub ] +exp/tri4a/decode_dev/wer_4 %WER 68.40 [ 22920 / 33509, 5091 ins, 2106 del, 15723 sub ] +exp/tri4a/decode_dev/wer_3 %WER 70.36 [ 23576 / 33509, 5576 ins, 1933 del, 16067 sub ] +exp/tri4a/decode_dev/wer_2 %WER 72.33 [ 24236 / 33509, 6047 ins, 1819 del, 16370 sub ] + +-------------------------------------------------------------------------------------- ++ More leaves and 
gaussians +-------------------------------------------------------------------------------------- +exp/tri5a/decode_dev/wer_12 %WER 58.06 [ 19456 / 33509, 1866 ins, 4379 del, 13211 sub ] +exp/tri5a/decode_dev/wer_11 %WER 58.19 [ 19498 / 33509, 2105 ins, 4031 del, 13362 sub ] +exp/tri5a/decode_dev/wer_13 %WER 58.37 [ 19558 / 33509, 1670 ins, 4734 del, 13154 sub ] +exp/tri5a/decode_dev/wer_10 %WER 58.64 [ 19651 / 33509, 2364 ins, 3696 del, 13591 sub ] +exp/tri5a/decode_dev/wer_9 %WER 59.46 [ 19923 / 33509, 2711 ins, 3386 del, 13826 sub ] +exp/tri5a/decode_dev/wer_8 %WER 60.49 [ 20270 / 33509, 3093 ins, 3040 del, 14137 sub ] +exp/tri5a/decode_dev/wer_7 %WER 62.28 [ 20871 / 33509, 3592 ins, 2751 del, 14528 sub ] +exp/tri5a/decode_dev/wer_6 %WER 64.11 [ 21483 / 33509, 4107 ins, 2465 del, 14911 sub ] +exp/tri5a/decode_dev/wer_5 %WER 66.27 [ 22208 / 33509, 4674 ins, 2274 del, 15260 sub ] +exp/tri5a/decode_dev/wer_4 %WER 68.31 [ 22891 / 33509, 5171 ins, 2076 del, 15644 sub ] +exp/tri5a/decode_dev/wer_3 %WER 70.35 [ 23574 / 33509, 5646 ins, 1893 del, 16035 sub ] +exp/tri5a/decode_dev/wer_2 %WER 72.46 [ 24279 / 33509, 6152 ins, 1784 del, 16343 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Test *** +exp/tri5a/decode_test/wer_10 %WER 57.09 [ 8927 / 15637, 879 ins, 1783 del, 6265 sub ] +exp/tri5a/decode_test/wer_11 %WER 56.60 [ 8851 / 15637, 782 ins, 1946 del, 6123 sub ] +exp/tri5a/decode_test/wer_12 %WER 56.46 [ 8828 / 15637, 688 ins, 2085 del, 6055 sub ] +exp/tri5a/decode_test/wer_13 %WER 56.73 [ 8871 / 15637, 629 ins, 2241 del, 6001 sub ] +exp/tri5a/decode_test/wer_2 %WER 68.81 [ 10760 / 15637, 2364 ins, 932 del, 7464 sub ] +exp/tri5a/decode_test/wer_3 %WER 66.74 [ 10436 / 15637, 2152 ins, 995 del, 7289 sub ] +exp/tri5a/decode_test/wer_4 %WER 64.55 [ 10093 / 15637, 1919 ins, 1073 del, 7101 sub ] +exp/tri5a/decode_test/wer_5 %WER 62.86 [ 9830 / 15637, 1727 ins, 1163 del, 6940 sub ] +exp/tri5a/decode_test/wer_6 %WER 61.03 [ 9543 / 15637, 1497 ins, 1286 del, 6760 sub ] +exp/tri5a/decode_test/wer_7 %WER 59.44 [ 9295 / 15637, 1311 ins, 1391 del, 6593 sub ] +exp/tri5a/decode_test/wer_8 %WER 58.41 [ 9134 / 15637, 1141 ins, 1515 del, 6478 sub ] +exp/tri5a/decode_test/wer_9 %WER 57.72 [ 9025 / 15637, 1008 ins, 1651 del, 6366 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Supplement *** +exp/tri5a/decode_sup/wer_10 %WER 63.77 [ 11101 / 17409, 1164 ins, 2476 del, 7461 sub ] +exp/tri5a/decode_sup/wer_11 %WER 63.52 [ 11059 / 17409, 1042 ins, 2666 del, 7351 sub ] +exp/tri5a/decode_sup/wer_12 %WER 63.29 [ 11019 / 17409, 930 ins, 2884 del, 7205 sub ] +exp/tri5a/decode_sup/wer_13 %WER 63.12 [ 10989 / 17409, 814 ins, 3124 del, 7051 sub ] +exp/tri5a/decode_sup/wer_2 %WER 75.75 [ 13187 / 17409, 2952 ins, 1279 del, 8956 sub ] +exp/tri5a/decode_sup/wer_3 %WER 74.18 [ 12914 / 17409, 2728 ins, 1371 del, 8815 sub ] +exp/tri5a/decode_sup/wer_4 %WER 72.28 [ 12584 / 17409, 2491 ins, 1444 del, 8649 sub ] +exp/tri5a/decode_sup/wer_5 %WER 70.04 [ 12194 / 17409, 2206 ins, 1562 del, 8426 sub ] +exp/tri5a/decode_sup/wer_6 %WER 68.20 [ 11873 / 17409, 1944 ins, 1719 del, 8210 sub ] +exp/tri5a/decode_sup/wer_7 %WER 66.61 [ 11596 / 17409, 1720 ins, 1880 del, 7996 sub ] +exp/tri5a/decode_sup/wer_8 %WER 65.37 [ 11381 / 17409, 1500 ins, 2075 del, 7806 sub ] +exp/tri5a/decode_sup/wer_9 %WER 64.45 [ 11220 / 17409, 1345 ins, 2275 del, 7600 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** H5 
*** +exp/tri5a/decode_h5/wer_10 %WER 61.38 [ 10303 / 16785, 839 ins, 2581 del, 6883 sub ] +exp/tri5a/decode_h5/wer_11 %WER 61.27 [ 10285 / 16785, 730 ins, 2760 del, 6795 sub ] +exp/tri5a/decode_h5/wer_12 %WER 61.41 [ 10307 / 16785, 646 ins, 2953 del, 6708 sub ] +exp/tri5a/decode_h5/wer_13 %WER 61.61 [ 10342 / 16785, 568 ins, 3132 del, 6642 sub ] +exp/tri5a/decode_h5/wer_2 %WER 71.50 [ 12001 / 16785, 2156 ins, 1385 del, 8460 sub ] +exp/tri5a/decode_h5/wer_3 %WER 69.96 [ 11742 / 16785, 1975 ins, 1476 del, 8291 sub ] +exp/tri5a/decode_h5/wer_4 %WER 68.23 [ 11453 / 16785, 1765 ins, 1569 del, 8119 sub ] +exp/tri5a/decode_h5/wer_5 %WER 66.48 [ 11159 / 16785, 1595 ins, 1703 del, 7861 sub ] +exp/tri5a/decode_h5/wer_6 %WER 64.88 [ 10890 / 16785, 1411 ins, 1839 del, 7640 sub ] +exp/tri5a/decode_h5/wer_7 %WER 63.67 [ 10687 / 16785, 1229 ins, 2019 del, 7439 sub ] +exp/tri5a/decode_h5/wer_8 %WER 62.63 [ 10513 / 16785, 1082 ins, 2193 del, 7238 sub ] +exp/tri5a/decode_h5/wer_9 %WER 61.95 [ 10399 / 16785, 959 ins, 2398 del, 7042 sub ] + + +-------------------------------------------------------------------------------------- +pNorm-Ensemble DNN +-------------------------------------------------------------------------------------- +exp/tri6a_dnn/decode_dev/wer_10 %WER 50.55 [ 16939 / 33509, 1407 ins, 4188 del, 11344 sub ] +exp/tri6a_dnn/decode_dev/wer_11 %WER 51.03 [ 17098 / 33509, 1239 ins, 4563 del, 11296 sub ] +exp/tri6a_dnn/decode_dev/wer_12 %WER 51.69 [ 17321 / 33509, 1126 ins, 5010 del, 11185 sub ] +exp/tri6a_dnn/decode_dev/wer_13 %WER 52.54 [ 17607 / 33509, 1010 ins, 5466 del, 11131 sub ] +exp/tri6a_dnn/decode_dev/wer_14 %WER 53.52 [ 17933 / 33509, 908 ins, 5918 del, 11107 sub ] +exp/tri6a_dnn/decode_dev/wer_15 %WER 54.36 [ 18214 / 33509, 817 ins, 6294 del, 11103 sub ] +exp/tri6a_dnn/decode_dev/wer_16 %WER 55.08 [ 18456 / 33509, 739 ins, 6622 del, 11095 sub ] +exp/tri6a_dnn/decode_dev/wer_8 %WER 50.34 [ 16869 / 33509, 1841 ins, 3456 del, 11572 sub ] +exp/tri6a_dnn/decode_dev/wer_9 %WER 50.31 [ 16859 / 33509, 1617 ins, 3794 del, 11448 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Test *** +exp/tri6a_dnn/decode_test/wer_10 %WER 49.76 [ 7781 / 15637, 542 ins, 2022 del, 5217 sub ] +exp/tri6a_dnn/decode_test/wer_11 %WER 50.40 [ 7881 / 15637, 489 ins, 2195 del, 5197 sub ] +exp/tri6a_dnn/decode_test/wer_12 %WER 50.82 [ 7947 / 15637, 431 ins, 2356 del, 5160 sub ] +exp/tri6a_dnn/decode_test/wer_13 %WER 51.72 [ 8087 / 15637, 375 ins, 2591 del, 5121 sub ] +exp/tri6a_dnn/decode_test/wer_14 %WER 52.65 [ 8233 / 15637, 324 ins, 2800 del, 5109 sub ] +exp/tri6a_dnn/decode_test/wer_15 %WER 53.57 [ 8376 / 15637, 284 ins, 2986 del, 5106 sub ] +exp/tri6a_dnn/decode_test/wer_16 %WER 54.37 [ 8502 / 15637, 246 ins, 3131 del, 5125 sub ] +exp/tri6a_dnn/decode_test/wer_8 %WER 49.33 [ 7714 / 15637, 696 ins, 1721 del, 5297 sub ] +exp/tri6a_dnn/decode_test/wer_9 %WER 49.54 [ 7747 / 15637, 632 ins, 1873 del, 5242 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Supplement *** +exp/tri6a_dnn/decode_sup/wer_10 %WER 58.14 [ 10121 / 17409, 895 ins, 2684 del, 6542 sub ] +exp/tri6a_dnn/decode_sup/wer_11 %WER 58.41 [ 10169 / 17409, 791 ins, 2927 del, 6451 sub ] +exp/tri6a_dnn/decode_sup/wer_12 %WER 58.71 [ 10220 / 17409, 681 ins, 3214 del, 6325 sub ] +exp/tri6a_dnn/decode_sup/wer_13 %WER 59.14 [ 10295 / 17409, 593 ins, 3502 del, 6200 sub ] +exp/tri6a_dnn/decode_sup/wer_14 %WER 59.84 [ 10417 / 17409, 515 ins, 3741 del, 6161 sub ] 
+exp/tri6a_dnn/decode_sup/wer_15 %WER 60.33 [ 10503 / 17409, 450 ins, 3974 del, 6079 sub ] +exp/tri6a_dnn/decode_sup/wer_16 %WER 60.78 [ 10581 / 17409, 393 ins, 4157 del, 6031 sub ] +exp/tri6a_dnn/decode_sup/wer_8 %WER 58.57 [ 10197 / 17409, 1194 ins, 2262 del, 6741 sub ] +exp/tri6a_dnn/decode_sup/wer_9 %WER 58.15 [ 10123 / 17409, 1023 ins, 2477 del, 6623 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** H5 *** +exp/tri6a_dnn/decode_h5/wer_10 %WER 55.98 [ 9396 / 16785, 592 ins, 2734 del, 6070 sub ] +exp/tri6a_dnn/decode_h5/wer_11 %WER 56.11 [ 9418 / 16785, 495 ins, 2974 del, 5949 sub ] +exp/tri6a_dnn/decode_h5/wer_12 %WER 56.75 [ 9526 / 16785, 418 ins, 3247 del, 5861 sub ] +exp/tri6a_dnn/decode_h5/wer_13 %WER 57.61 [ 9670 / 16785, 368 ins, 3482 del, 5820 sub ] +exp/tri6a_dnn/decode_h5/wer_14 %WER 58.37 [ 9797 / 16785, 318 ins, 3739 del, 5740 sub ] +exp/tri6a_dnn/decode_h5/wer_15 %WER 59.32 [ 9957 / 16785, 284 ins, 3960 del, 5713 sub ] +exp/tri6a_dnn/decode_h5/wer_16 %WER 59.93 [ 10060 / 16785, 256 ins, 4127 del, 5677 sub ] +exp/tri6a_dnn/decode_h5/wer_8 %WER 55.60 [ 9333 / 16785, 750 ins, 2323 del, 6260 sub ] +exp/tri6a_dnn/decode_h5/wer_9 %WER 55.76 [ 9360 / 16785, 666 ins, 2531 del, 6163 sub ] + +-------------------------------------------------------------------------------------- +TDNN + iVector +-------------------------------------------------------------------------------------- +exp/nnet3/nnet_tdnn_a/decode_dev/wer_10 %WER 53.55 [ 17943 / 33509, 1332 ins, 4855 del, 11756 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_11 %WER 53.82 [ 18033 / 33509, 1176 ins, 5278 del, 11579 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_12 %WER 54.17 [ 18153 / 33509, 1040 ins, 5696 del, 11417 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_13 %WER 54.75 [ 18345 / 33509, 912 ins, 6111 del, 11322 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_2 %WER 65.73 [ 22026 / 33509, 4773 ins, 2143 del, 15110 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_3 %WER 62.48 [ 20937 / 33509, 4112 ins, 2383 del, 14442 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_4 %WER 59.65 [ 19989 / 33509, 3488 ins, 2699 del, 13802 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_5 %WER 57.41 [ 19238 / 33509, 2942 ins, 3032 del, 13264 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_6 %WER 55.58 [ 18624 / 33509, 2461 ins, 3356 del, 12807 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_7 %WER 54.44 [ 18242 / 33509, 2092 ins, 3714 del, 12436 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_8 %WER 53.63 [ 17970 / 33509, 1766 ins, 4087 del, 12117 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_9 %WER 53.51 [ 17931 / 33509, 1533 ins, 4452 del, 11946 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Test *** +exp/nnet3/nnet_tdnn_a/decode_test/wer_10 %WER 52.29 [ 8177 / 15637, 536 ins, 2190 del, 5451 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_11 %WER 52.49 [ 8208 / 15637, 474 ins, 2373 del, 5361 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_12 %WER 53.03 [ 8293 / 15637, 420 ins, 2558 del, 5315 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_13 %WER 53.52 [ 8369 / 15637, 361 ins, 2721 del, 5287 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_2 %WER 64.18 [ 10036 / 15637, 2048 ins, 980 del, 7008 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_3 %WER 60.84 [ 9513 / 15637, 1726 ins, 1076 del, 6711 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_4 %WER 58.23 [ 9106 / 15637, 1471 ins, 1210 del, 6425 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_5 %WER 55.77 [ 8720 / 15637, 1206 ins, 1351 del, 6163 sub ] 
+exp/nnet3/nnet_tdnn_a/decode_test/wer_6 %WER 54.19 [ 8474 / 15637, 1005 ins, 1505 del, 5964 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_7 %WER 53.08 [ 8300 / 15637, 827 ins, 1689 del, 5784 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_8 %WER 52.43 [ 8198 / 15637, 712 ins, 1841 del, 5645 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_9 %WER 52.10 [ 8147 / 15637, 619 ins, 1993 del, 5535 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Supplement *** +exp/nnet3/nnet_tdnn_a/decode_sup/wer_10 %WER 80.31 [ 13981 / 17409, 407 ins, 8528 del, 5046 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_11 %WER 80.42 [ 14001 / 17409, 360 ins, 8752 del, 4889 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_12 %WER 80.52 [ 14017 / 17409, 318 ins, 8968 del, 4731 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_13 %WER 80.79 [ 14065 / 17409, 291 ins, 9155 del, 4619 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_2 %WER 85.93 [ 14960 / 17409, 1330 ins, 6454 del, 7176 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_3 %WER 84.65 [ 14737 / 17409, 1151 ins, 6635 del, 6951 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_4 %WER 83.31 [ 14504 / 17409, 968 ins, 6890 del, 6646 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_5 %WER 82.52 [ 14366 / 17409, 839 ins, 7159 del, 6368 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_6 %WER 81.66 [ 14216 / 17409, 711 ins, 7477 del, 6028 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_7 %WER 81.08 [ 14116 / 17409, 631 ins, 7750 del, 5735 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_8 %WER 80.52 [ 14017 / 17409, 547 ins, 7999 del, 5471 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_9 %WER 80.31 [ 13982 / 17409, 468 ins, 8269 del, 5245 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** H5 *** +exp/nnet3/nnet_tdnn_a/decode_h5/wer_10 %WER 85.15 [ 14293 / 16785, 170 ins, 9449 del, 4674 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_11 %WER 85.24 [ 14307 / 16785, 142 ins, 9700 del, 4465 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_12 %WER 85.51 [ 14353 / 16785, 119 ins, 9920 del, 4314 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_13 %WER 85.81 [ 14403 / 16785, 106 ins, 10113 del, 4184 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_2 %WER 88.11 [ 14790 / 16785, 749 ins, 7107 del, 6934 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_3 %WER 87.16 [ 14629 / 16785, 630 ins, 7324 del, 6675 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_4 %WER 86.45 [ 14510 / 16785, 509 ins, 7607 del, 6394 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_5 %WER 85.71 [ 14387 / 16785, 423 ins, 7925 del, 6039 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_6 %WER 85.24 [ 14307 / 16785, 341 ins, 8248 del, 5718 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_7 %WER 84.99 [ 14266 / 16785, 277 ins, 8617 del, 5372 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_8 %WER 84.85 [ 14242 / 16785, 230 ins, 8916 del, 5096 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_9 %WER 84.92 [ 14253 / 16785, 192 ins, 9200 del, 4861 sub ] diff --git a/egs/callhome_egyptian/s5/cmd.sh b/egs/callhome_egyptian/s5/cmd.sh index ab29f13d4cc..71dd849a93b 100755 --- a/egs/callhome_egyptian/s5/cmd.sh +++ b/egs/callhome_egyptian/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/callhome_egyptian/s5/conf/mfcc_hires.conf b/egs/callhome_egyptian/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ad519cf4f9c --- /dev/null +++ b/egs/callhome_egyptian/s5/conf/mfcc_hires.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) +--sample-frequency=8000 diff --git a/egs/callhome_egyptian/s5/conf/online_cmvn.conf b/egs/callhome_egyptian/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/callhome_egyptian/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh b/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh index aaa45f8e4e1..78059f153a8 100755 --- a/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh +++ b/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -12,25 +12,18 @@ mkdir -p data/lang_test cp -r data/lang/* data/lang_test # grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause +# LM doesn't have these "invalid combinations". 
These can cause # determinization failures of CLG [ends up being epsilon cycles]. # Note: remove_oovs.pl takes a list of words in the LM that aren't in # our word list. Since our LM doesn't have any, we just give it # /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -59,4 +52,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/callhome_egyptian/s5/local/callhome_data_prep.sh b/egs/callhome_egyptian/s5/local/callhome_data_prep.sh index c8e5b30038e..f8a4430aeda 100755 --- a/egs/callhome_egyptian/s5/local/callhome_data_prep.sh +++ b/egs/callhome_egyptian/s5/local/callhome_data_prep.sh @@ -1,8 +1,8 @@ #!/bin/bash # # Johns Hopkins University : (Gaurav Kumar) -# The input is the Callhome Egyptian Arabic Dataset which contains *.sph files -# In addition the transcripts are needed as well. +# The input is the Callhome Egyptian Arabic Dataset which contains *.sph files +# In addition the transcripts are needed as well. #TODO: Rewrite intro, copyright stuff and dir information # To be run from one directory above this script. @@ -12,7 +12,7 @@ stage=0 export LC_ALL=C -if [ $# -lt 2 ]; then +if [ $# -lt 6 ]; then echo "Arguments should be the location of the Callhome Egyptian Arabic Speech and Transcript Directories, se e ../run.sh for example." exit 1; @@ -45,8 +45,18 @@ ln -s $* links # Basic spot checks to see if we got the data that we needed if [ ! -d links/LDC97S45 -o ! -d links/LDC97T19 ]; then - echo "The speech and the data directories need to be named LDC97S45 and LDC97T19 respecti -vely" + echo "The speech and the data directories need to be named LDC97S45 and LDC97T19 respectively" + exit 1; +fi +if [ ! -d links/LDC2002S37 -o ! -d links/LDC2002T38 ]; +then + echo "The Callhome supplement directories need to be named LDC2002S37 and LDC2002T38." + o + exit 1; +fi +if [ ! -d links/LDC2002S22 -o ! -d links/LDC2002T39 ]; +then + echo "The H5-ECA directories need to be named LDC2002S22 and LDC2002T39." exit 1; fi @@ -63,27 +73,71 @@ then exit 1; fi +if [ ! -d links/LDC2002S37/SPEECH ]; +then + echo "Callhome supplement directories missing or not properly organised within the speech data dir" + exit 1; +fi + +if [ ! -d links/LDC2002T38/ch_ara_transcr_suppl/transcr ] +then + echo "Callhome supplement Transcript directories missing or not properly organised" + exit 1; +fi + +if [ ! -d links/LDC2002S22/SPEECH ]; +then + echo "H5 directories missing or not properly organised within the speech data dir" + exit 1; +fi + +if [ ! 
-d links/LDC2002T39/transcr ] +then + echo "H5 Transcript directories missing or not properly organised" + exit 1; +fi + speech_train=$dir/links/LDC97S45/CALLHOME/ARABIC/TRAIN speech_dev=$dir/links/LDC97S45/CALLHOME/ARABIC/DEVTEST speech_test=$dir/links/LDC97S45/CALLHOME/ARABIC/EVLTEST -transcripts_train=$dir/links/LDC97T19/callhome_arabic_trans_970711/transcrp/train/roman +transcripts_train=$dir/links/LDC97T19/callhome_arabic_trans_970711/transcrp/train/roman transcripts_dev=$dir/links/LDC97T19/callhome_arabic_trans_970711/transcrp/devtest/roman transcripts_test=$dir/links/LDC97T19/callhome_arabic_trans_970711/transcrp/evaltest/roman - -fcount_train=`find ${speech_train} -iname '*.SPH' | wc -l` -fcount_dev=`find ${speech_dev} -iname '*.SPH' | wc -l` -fcount_test=`find ${speech_test} -iname '*.SPH' | wc -l` -fcount_t_train=`find ${transcripts_train} -iname '*.txt' | wc -l` -fcount_t_dev=`find ${transcripts_dev} -iname '*.txt' | wc -l` -fcount_t_test=`find ${transcripts_test} -iname '*.txt' | wc -l` +speech_sup=$dir/links/LDC2002S37/SPEECH +transcripts_sup=$dir/links/LDC2002T38/ch_ara_transcr_suppl/transcr +speech_h5=$dir/links/LDC2002S22/SPEECH +transcripts_h5=$dir/links/LDC2002T39/transcr + +fcount_train=`find ${speech_train} -iname '*.SPH' | wc -l` +fcount_dev=`find ${speech_dev} -iname '*.SPH' | wc -l` +fcount_test=`find ${speech_test} -iname '*.SPH' | wc -l` +fcount_t_train=`find ${transcripts_train} -iname '*.txt' | wc -l` +fcount_t_dev=`find ${transcripts_dev} -iname '*.txt' | wc -l` +fcount_t_test=`find ${transcripts_test} -iname '*.txt' | wc -l` +fcount_sup=`find ${speech_sup} -iname '*.SPH' | wc -l` +fcount_t_sup=`find ${transcripts_sup} -iname '*.txt' | wc -l` +fcount_h5=`find ${speech_h5} -iname '*.SPH' | wc -l` +fcount_t_h5=`find ${transcripts_h5} -iname '*.txt' | wc -l` #Now check if we got all the files that we needed -if [ $fcount_train != 80 -o $fcount_dev != 20 -o $fcount_test != 20 -o $fcount_t_train != 80 -o $fcount_t_dev != 20 -o $fcount_t_test != 20 ]; -then - echo "Incorrect number of files in the data directories" +if [ $fcount_train != 80 -o $fcount_dev != 20 -o $fcount_test != 20 -o $fcount_t_train != 80 -o $fcount_t_dev != 20 -o $fcount_t_test != 20 ]; +then + echo "Incorrect number of files in the data directories" echo "The paritions should contain 80/20/20 files" - exit 1; -fi + exit 1; +fi +if [ $fcount_sup != 20 -o $fcount_t_sup != 20 ]; +then + echo "Incorrect number of files in the ECA sup data directories" + echo "The paritions should contain 20/20 files" + exit 1; +fi +if [ $fcount_h5 != 20 -o $fcount_t_h5 != 20 ]; +then + echo "Incorrect number of files in the H5 data directories" + echo "The paritions should contain 20/20 files" + exit 1; +fi if [ $stage -le 0 ]; then #Gather all the speech files together to create a file list @@ -91,15 +145,19 @@ if [ $stage -le 0 ]; then find $speech_train -iname '*.sph'; find $speech_dev -iname '*.sph'; find $speech_test -iname '*.sph'; + find $speech_sup -iname '*.sph'; + find $speech_h5 -iname '*.sph'; ) > $tmpdir/callhome_train_sph.flist #Get all the transcripts in one place - ( + ( find $transcripts_train -iname '*.txt'; find $transcripts_dev -iname '*.txt'; find $transcripts_test -iname '*.txt'; - ) > $tmpdir/callhome_train_transcripts.flist + find $transcripts_sup -iname '*.txt'; + find $transcripts_h5 -iname '*.txt'; + ) > $tmpdir/callhome_train_transcripts.flist fi @@ -109,7 +167,7 @@ if [ $stage -le 1 ]; then mv $tmpdir/reco2file_and_channel $dir/train_all/ fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; 
then sort $tmpdir/text.1 | grep -v '((' | \ awk '{if (NF > 1){ print; }}' | \ sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ @@ -145,7 +203,7 @@ if [ $stage -le 2 ]; then ! cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ && echo "Error producing utt2spk file" && exit 1; - # Remove utterances that have the same start and end time. Corresponding text entries will be removed when use + # Remove utterances that have the same start and end time. Corresponding text entries will be removed when use # fix_data_dir.sh and validate_data_dir.sh later cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' | \ diff --git a/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py new file mode 100644 index 00000000000..f5b69a1ff86 --- /dev/null +++ b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py @@ -0,0 +1,38 @@ +#!/usr/bin/env py + +# Converts a romanized ECA word list (symbol table) to +# a version in the arabic script + +import sys +import codecs + +if len(sys.argv) < 3: + print "USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]" + print "E.g., local/convert_symtable_to_utf.py data/lang/words.txt \ + /export/corpora/LDC/LDC99L22" + sys.exit(1) + +# Note that the ECA lexicon's default encoding is ISO-8859-6, not UTF8 +symtable = codecs.open(sys.argv[1], encoding="utf8") +lexicon = codecs.open(sys.argv[2] + "/callhome_arabic_lexicon_991012/ar_lex.v07", encoding="iso-8859-6") + +dict_cache = {} +# First read off the dictionary and store stuff in a cache +for line in lexicon: + line = line.strip().split() + roman = line[0].strip() + script = line[1].strip() + assert roman not in dict_cache + dict_cache[roman] = script + +# Now read the symbol table and write off the ut8 versions +for line in symtable: + line = line.strip().split() + if line[0] in dict_cache: + output = dict_cache[line[0]] + " " + line[1] + else: + output = line[0] + " " + line[1] + sys.stdout.write(output.encode("utf-8") + "\n") + +lexicon.close() +symtable.close() diff --git a/egs/callhome_egyptian/s5/local/create_splits b/egs/callhome_egyptian/s5/local/create_splits index 98b27b0109e..80a32cea394 100755 --- a/egs/callhome_egyptian/s5/local/create_splits +++ b/egs/callhome_egyptian/s5/local/create_splits @@ -11,7 +11,7 @@ fi splitFile=$1 # Train first -for split in train dev test +for split in train dev test sup h5 do cp -r $train_all $data_dir/$split diff --git a/egs/callhome_egyptian/s5/local/nnet3/run_ivector_common.sh b/egs/callhome_egyptian/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..db79dd138b2 --- /dev/null +++ b/egs/callhome_egyptian/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +# Inherited from the WSJ nnet3 recipe, modified for use with ECA + +# this script is called from scripts like run_ms.sh; it does the common stages +# of the build, such as feature extraction. +# This is actually the same as local/online/run_nnet2_common.sh, except +# for the directory names. + +mfccdir=mfcc + +stage=1 + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + + +if [ $stage -le 1 ]; then + for datadir in train dev test sup h5; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done + utils/subset_data_dir.sh --first data/train 7388 data/train_small || exit 1 + utils/subset_data_dir.sh --first data/train_hires 7388 data/train_small_hires || exit 1 +fi + +if [ $stage -le 2 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We align the si84 data for this purpose. + steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ + data/train_small data/lang exp/tri5a exp/nnet3/tri5a_ali_small +fi + +if [ $stage -le 3 ]; then + # Train a small system just for its LDA+MLLT transform. We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/train_small_hires data/lang \ + exp/nnet3/tri5a_ali_small exp/nnet3/tri5b +fi + +if [ $stage -le 4 ]; then + mkdir -p exp/nnet3 + + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \ + --num-frames 400000 data/train_small_hires 256 exp/nnet3/tri5b exp/nnet3/diag_ubm +fi + +if [ $stage -le 5 ]; then + # even though $nj is just 10, each job uses multiple processes and threads. + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 6 ]; then + # We extract iVectors on all the train_si284 data, which will be what we + # train the system on. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires \ + data/train_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/train_hires_max2 exp/nnet3/extractor exp/nnet3/ivectors_train || exit 1; +fi + +if [ $stage -le 7 ]; then + rm exp/nnet3/.error 2>/dev/null + for data in dev test sup h5; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ + data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || touch exp/nnet3/.error & + done + wait + [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; +fi + +exit 0; diff --git a/egs/callhome_egyptian/s5/local/nnet3/run_tdnn.sh b/egs/callhome_egyptian/s5/local/nnet3/run_tdnn.sh new file mode 100755 index 00000000000..bd0a6afbda6 --- /dev/null +++ b/egs/callhome_egyptian/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +train_stage=-10 +dir=exp/nnet3/nnet_tdnn_a +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/callhome_egyptian/s5/run.sh b/egs/callhome_egyptian/s5/run.sh index 53753e31be2..9d1fa692da0 100755 --- a/egs/callhome_egyptian/s5/run.sh +++ b/egs/callhome_egyptian/s5/run.sh @@ -15,16 +15,19 @@ set -e eca_speech=/export/corpora/LDC/LDC97S45 eca_transcripts=/export/corpora/LDC/LDC97T19 eca_lexicon=/export/corpora/LDC/LDC99L22 +sup_speech=/export/corpora/LDC/LDC2002S37 +sup_transcripts=/export/corpora/LDC/LDC2002T38 +h5_speech=/export/corpora/LDC/LDC2002S22 +h5_transcripts=/export/corpora/LDC/LDC2002T39 split=local/splits -local/callhome_data_prep.sh $eca_speech $eca_transcripts +local/callhome_data_prep.sh $eca_speech $eca_transcripts $sup_speech $sup_transcripts $h5_speech $h5_transcripts local/callhome_prepare_dict.sh $eca_lexicon # Added c,j, v to the non silences phones manually utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - # Make sure that you do not use your test and your dev sets to train the LM # Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation @@ -47,6 +50,8 @@ local/create_splits $split # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir +steps/compute_cmvn_stats.sh data/sup exp/make_mfcc/sup $mfccdir +steps/compute_cmvn_stats.sh data/h5 exp/make_mfcc/h5 $mfccdir steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir @@ -57,7 +62,7 @@ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir # utterances from those. steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train data/lang exp/mono0a + data/train data/lang exp/mono0a steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train data/lang exp/mono0a exp/mono0a_ali || exit 1; @@ -77,12 +82,11 @@ steps/train_deltas.sh --cmd "$train_cmd" \ 1400 15000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; ( - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; )& - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train data/lang exp/tri2 exp/tri2_ali || exit 1; @@ -125,15 +129,51 @@ steps/train_sat.sh --cmd "$train_cmd" \ exp/tri5a/graph data/dev exp/tri5a/decode_dev )& -steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ -exp/tri5a/graph data/test exp/tri5a/decode_test +( + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test + # Decode Supplement and H5 + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/sup exp/tri5a/decode_sup + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/h5 exp/tri5a/decode_h5 +)& + +dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") +dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + +steps/nnet2/train_pnorm_ensemble.sh \ + --mix-up 5000 --initial-learning-rate 0.008 
--final-learning-rate 0.0008\ + --num-hidden-layers 4 --pnorm-input-dim 2000 --pnorm-output-dim 200\ + --cmd "$train_cmd" \ + "${dnn_gpu_parallel_opts[@]}" \ + --ensemble-size 4 --initial-beta 0.1 --final-beta 5 \ + data/train data/lang exp/tri5a_ali exp/tri6a_dnn -# Decode CALLHOME -steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ -exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test -steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ -exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev -steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ -exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train +( + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev +) & + +# Decode test sets +( + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_test exp/tri5a/graph data/test exp/tri6a_dnn/decode_test + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_sup exp/tri5a/graph data/sup exp/tri6a_dnn/decode_sup + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_h5 exp/tri5a/graph data/h5 exp/tri6a_dnn/decode_h5 +) & + +wait + +# (TDNN + iVectors) training +# Note that the alignments used by run_tdnn.sh come from the pnorm-ensemble model +# If you choose to skip ensemble training (which is slow), use the best +# fmllr alignments available (tri4a) +# You can modify this in local/nnet/run_tdnn.sh +local/nnet3/run_tdnn.sh exit 0; diff --git a/egs/chime1/s5/cmd.sh b/egs/chime1/s5/cmd.sh index dda6226f419..0dcd5a9200f 100755 --- a/egs/chime1/s5/cmd.sh +++ b/egs/chime1/s5/cmd.sh @@ -1,39 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - - -#c) USFD cluster options -#config="conf/queue_usfd.conf" -#export train_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export decode_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export mkgraph_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export cuda_cmd="queue.pl --config $config --mem 24G --rmem 20G --gpu 1 --time 24:00:00" - - -#d) run it locally... 
-export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" + +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/chime1/s5/path.sh b/egs/chime1/s5/path.sh index 59966f91a53..1a6fb5f891b 100755 --- a/egs/chime1/s5/path.sh +++ b/egs/chime1/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/chime2/s5/cmd.sh b/egs/chime2/s5/cmd.sh index 8bb00fe0ec6..0dcd5a9200f 100644 --- a/egs/chime2/s5/cmd.sh +++ b/egs/chime2/s5/cmd.sh @@ -1,30 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
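# For reference, an illustrative sketch (not taken from this recipe): on a machine
# with no grid engine all of these can simply point at run.pl, e.g.
#   export train_cmd=run.pl
#   export decode_cmd=run.pl
#   export cuda_cmd=run.pl
# while on a SLURM cluster the same generic options work through utils/slurm.pl, e.g.
#   export train_cmd="slurm.pl --mem 2G"
#   export decode_cmd="slurm.pl --mem 4G"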
-#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G" -export cuda_cmd="queue.pl -l gpu=1" -#export cuda_cmd="..." +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/chime2/s5/local/chime_format_data.sh b/egs/chime2/s5/local/chime_format_data.sh index 2c0728b943e..5870174aff4 100755 --- a/egs/chime2/s5/local/chime_format_data.sh +++ b/egs/chime2/s5/local/chime_format_data.sh @@ -17,11 +17,9 @@ echo "Preparing train and test data" srcdir=data/local/data lmdir=data/local/nist_lm -tmpdir=data/local/lm_tmp lexicon=data/local/lang_tmp/lexiconp.txt -mkdir -p $tmpdir -for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do +for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.txt data/$x/text || exit 1; @@ -42,25 +40,10 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do cp -r data/lang/* $test gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; done echo "Succeeded in formatting data." -rm -r $tmpdir diff --git a/egs/chime2/s5/path.sh b/egs/chime2/s5/path.sh index fee0b9b0c11..2d17b17a84a 100755 --- a/egs/chime2/s5/path.sh +++ b/egs/chime2/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. 
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/chime3/README.txt b/egs/chime3/README.txt index 6c55689b298..3d52c0a8a04 100644 --- a/egs/chime3/README.txt +++ b/egs/chime3/README.txt @@ -6,7 +6,7 @@ If you use these data in a publication, please cite: Jon Barker, Ricard Marxer, Emmanuel Vincent, and Shinji Watanabe, The third 'CHiME' Speech Separation and Recognition Challenge: Dataset, task and baselines, submitted to IEEE 2015 Automatic Speech Recognition -and Understanding Workshop (ASRU), 2015. +and Understanding Workshop (ASRU), 2015. Quick instruction: 1) Download CHiME3 data @@ -33,6 +33,7 @@ nohup ./run.sh > run.log local/run_gmm.sh local/run_dnn.sh +local/run_lmrescore.sh You can put in your working directory. But please make sure to use the same directory structure and naming convention with those of the @@ -45,6 +46,7 @@ You don't have to execute local/run_init.sh twice. enhan= GMM clean training: exp/tri3b_tr05_orig_clean/best_wer_$enhan.result GMM multi training: exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result -DNN multi training: exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result +DNN multi training: exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result +DNN multi training with LM rescoring: exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats_lmrescore/best_wer_${enhan}_rnnlm_5k_h300_w0.5_n100.result Note that training on clean data means original WSJ0 data only (no booth data) diff --git a/egs/chime3/s5/RESULTS b/egs/chime3/s5/RESULTS new file mode 100644 index 00000000000..7e00f49542a --- /dev/null +++ b/egs/chime3/s5/RESULTS @@ -0,0 +1,95 @@ +# The result based on Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. 
ASRU'15 +# Note that the following result is different from that in the paper since we don't include +# SRI's robust features and system combination + +GMM multi-condition +exp/tri3b_tr05_multi_noisy/best_wer_noisy.result +------------------- +best overall dt05 WER 18.49% (language model weight = 10) +------------------- +dt05_simu WER: 18.36% (Average), 18.72% (BUS), 22.46% (CAFE), 14.97% (PEDESTRIAN), 17.27% (STREET) +------------------- +dt05_real WER: 18.62% (Average), 26.17% (BUS), 17.18% (CAFE), 12.92% (PEDESTRIAN), 18.20% (STREET) +------------------- +et05_simu WER: 21.40% (Average), 19.14% (BUS), 24.08% (CAFE), 21.68% (PEDESTRIAN), 20.69% (STREET) +------------------- +et05_real WER: 32.54% (Average), 48.76% (BUS), 32.84% (CAFE), 27.30% (PEDESTRIAN), 21.29% (STREET) +------------------- + + +GMM with beamformit +exp/tri3b_tr05_multi_beamformit_5mics/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 13.83% (language model weight = 11) +------------------- +dt05_simu WER: 14.87% (Average), 12.36% (BUS), 17.95% (CAFE), 12.92% (PEDESTRIAN), 16.27% (STREET) +------------------- +dt05_real WER: 12.78% (Average), 16.17% (BUS), 12.20% (CAFE), 9.62% (PEDESTRIAN), 13.14% (STREET) +------------------- +et05_simu WER: 23.13% (Average), 16.27% (BUS), 24.86% (CAFE), 26.06% (PEDESTRIAN), 25.33% (STREET) +------------------- +et05_real WER: 23.06% (Average), 31.31% (BUS), 21.85% (CAFE), 21.86% (PEDESTRIAN), 17.22% (STREET) +------------------- + + +DNN +exp/tri4a_dnn_tr05_multi_beamformit_5mics/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 10.34% (language model weight = 10) +------------------- +dt05_simu WER: 11.08% (Average), 10.09% (BUS), 13.01% (CAFE), 9.23% (PEDESTRIAN), 12.01% (STREET) +------------------- +dt05_real WER: 9.59% (Average), 12.67% (BUS), 9.41% (CAFE), 6.65% (PEDESTRIAN), 9.64% (STREET) +------------------- +et05_simu WER: 17.48% (Average), 12.57% (BUS), 18.04% (CAFE), 18.64% (PEDESTRIAN), 20.66% (STREET) +------------------- +et05_real WER: 17.89% (Average), 26.77% (BUS), 16.57% (CAFE), 14.85% (PEDESTRIAN), 13.37% (STREET) +------------------- + + +DNN sMBR +exp/tri4a_dnn_tr05_multi_beamformit_5mics_smbr_i1lats/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 9.24% (language model weight = 10) + (Number of iterations = 4) +------------------- +dt05_simu WER: 9.90% (Average), 9.38% (BUS), 11.70% (CAFE), 8.14% (PEDESTRIAN), 10.40% (STREET) +------------------- +dt05_real WER: 8.58% (Average), 11.54% (BUS), 8.36% (CAFE), 5.74% (PEDESTRIAN), 8.67% (STREET) +------------------- +et05_simu WER: 16.01% (Average), 11.97% (BUS), 16.49% (CAFE), 16.51% (PEDESTRIAN), 19.07% (STREET) +------------------- +et05_real WER: 15.88% (Average), 23.54% (BUS), 14.21% (CAFE), 13.42% (PEDESTRIAN), 12.35% (STREET) +------------------- + + +5-gram rescoring +exp/tri4a_dnn_tr05_multi_beamformit_5mics_smbr_lmrescore/best_wer_beamformit_5mics_5gkn_5k.result +------------------- +best overall dt05 WER 7.73% (language model weight = 10) +------------------- +dt05_simu WER: 8.43% (Average), 7.83% (BUS), 10.19% (CAFE), 6.87% (PEDESTRIAN), 8.83% (STREET) +------------------- +dt05_real WER: 7.02% (Average), 9.13% (BUS), 7.08% (CAFE), 4.62% (PEDESTRIAN), 7.27% (STREET) +------------------- +et05_simu WER: 13.94% (Average), 10.87% (BUS), 14.42% (CAFE), 13.69% (PEDESTRIAN), 16.79% (STREET) +------------------- +et05_real WER: 14.12% (Average), 21.57% (BUS), 12.22% (CAFE), 11.36% (PEDESTRIAN), 11.32% (STREET) 
+------------------- + + +RNNLM +exp/tri4a_dnn_tr05_multi_beamformit_5mics_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 7.14% (language model weight = 6) +------------------- +dt05_simu WER: 7.83% (Average), 7.29% (BUS), 9.62% (CAFE), 6.08% (PEDESTRIAN), 8.33% (STREET) +------------------- +dt05_real WER: 6.45% (Average), 8.48% (BUS), 6.19% (CAFE), 4.53% (PEDESTRIAN), 6.61% (STREET) +------------------- +et05_simu WER: 12.86% (Average), 9.92% (BUS), 13.35% (CAFE), 12.59% (PEDESTRIAN), 15.60% (STREET) +------------------- +et05_real WER: 12.79% (Average), 19.14% (BUS), 11.39% (CAFE), 10.33% (PEDESTRIAN), 10.31% (STREET) +------------------- + diff --git a/egs/chime3/s5/conf/ami.cfg b/egs/chime3/s5/conf/ami.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime3/s5/conf/ami.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime3/s5/local/chime3_beamform.sh b/egs/chime3/s5/local/chime3_beamform.sh new file mode 100755 index 00000000000..170a37ccd84 --- /dev/null +++ b/egs/chime3/s5/local/chime3_beamform.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +nj=10 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 2)" + echo "Usage: local/chime3_beamform.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +sdir=$1 +odir=$2 +wdir=data/local/beamforming + +if [ -z $BEAMFORMIT ] ; then + export BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt +fi +export PATH=${PATH}:$BEAMFORMIT +! hash BeamformIt && echo "Missing BeamformIt, run 'cd ../../../tools/; make beamformit;'" && exit 1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $wdir/log + +# we use the following channel signals, and remove 2nd channel signal, which located on the back of +# tablet, and behaves very different from the other front channel signals. +bmf="1 3 4 5 6" +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$wdir/wavfiles.list +find $sdir/*{simu,real} | grep CH1.wav \ + | awk -F '/' '{print $(NF-1) "/" $NF}' | sed -e "s/\.CH1\.wav//" | sort > $output_wavfiles + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$wdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $wdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/ami.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done +# making a subdirectory for the output wav files +for x in `awk -F '/' '{print $1}' $output_wavfiles | sort | uniq`; do + mkdir -p $odir/$x +done + +chmod a+x $wdir/log/beamform.*.sh +$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + $wdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime3/s5/local/chime3_calc_wers.sh b/egs/chime3/s5/local/chime3_calc_wers.sh index 83d9b7f4251..b083faec56b 100755 --- a/egs/chime3/s5/local/chime3_calc_wers.sh +++ b/egs/chime3/s5/local/chime3_calc_wers.sh @@ -6,7 +6,7 @@ set -e if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` + printf "\nUSAGE: %s \n\n" `basename $0` printf "%s exp/tri3b_tr05_sr_noisy noisy\n\n" `basename $0` exit 1; fi @@ -28,7 +28,7 @@ for a in `find $dir/decode_tgpr_5k_dt05_real_$enhan/ | grep "\/wer_" | awk -F'[/ cat $dir/decode_tgpr_5k_dt05_{real,simu}_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' else cat $dir/decode_tgpr_5k_dt05_real_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' - fi + fi done | sort -n -k 2 | head -n 1 > $dir/log/best_wer_$enhan lmw=`cut -f 1 -d" " $dir/log/best_wer_$enhan | cut -f 2 -d"_"` diff --git a/egs/chime3/s5/local/chime3_calc_wers_smbr.sh b/egs/chime3/s5/local/chime3_calc_wers_smbr.sh old mode 100644 new mode 100755 index ac63c0febb0..178e7a78b9c --- a/egs/chime3/s5/local/chime3_calc_wers_smbr.sh +++ b/egs/chime3/s5/local/chime3_calc_wers_smbr.sh @@ -6,7 +6,7 @@ set -e if [ $# -ne 3 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` + printf "\nUSAGE: %s \n\n" `basename $0` printf "%s exp/tri3b_tr05_sr_noisy noisy exp/tri4a_dnn_tr05_sr_noisy/graph_tgpr_5k\n\n" `basename $0` exit 1; fi diff --git a/egs/chime3/s5/local/chime3_train_lms.sh b/egs/chime3/s5/local/chime3_train_lms.sh new file mode 100755 index 00000000000..984ef766b2a --- /dev/null +++ b/egs/chime3/s5/local/chime3_train_lms.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +# Modified from the script for CHiME3 baseline +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL 
(Author: Takaaki Hori) + +# Config: +order=5 # n-gram order + +. utils/parse_options.sh || exit 1; + +. ./path.sh + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specifies a CHiME3 root directory" + echo "If you use kaldi scripts distributed in the CHiME3 data," + echo "It would be `pwd`/../.." + exit 1; +fi + +# check data directories +chime3_data=$1 +wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime3 data root correctly" && exit 1 +fi +if [ ! -d $wsj0_data ]; then + echo "$wsj0_data does not exist. Please specify WSJ0 corpus directory" && exit 1 +fi +lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data + +# check whether run_init is executed +if [ ! -d data/lang ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +# lm directories +dir=data/local/local_lm +srcdir=data/local/nist_lm +mkdir -p $dir + +# check srilm ngram +! which ngram-count \ + && echo "SRILM tools not installed, which are required for LM training" && exit 1; + +# extract 5k vocabulary from a baseline language model +srclm=$srcdir/lm_tgpr_5k.arpa.gz +if [ -f $srclm ]; then + echo "Getting vocabulary from a baseline language model"; + gunzip -c $srclm | awk 'BEGIN{unig=0}{ + if(unig==0){ + if($1=="\\1-grams:"){unig=1}} + else { + if ($1 != "") { + if ($1=="\\2-grams:" || $1=="\\end\\") {exit} + else {print $2}} + }}' > $dir/vocab_5k.txt +else + echo "Language model $srclm does not exist" && exit 1; +fi + +# collect training data from WSJ0 +touch $dir/train.gz +if [ `du -m $dir/train.gz | cut -f 1` -eq 63 ]; then + echo "Not getting training data again [already exists]"; +else + echo "Collecting training data from $lm_train"; + gunzip -c $lm_train/{87,88,89}/*.z \ + | awk -v voc=$dir/vocab_5k.txt ' + BEGIN{ while((getline0) { invoc[$1]=1; }} + /^ "); } + } + printf("\n"); + }' | gzip -c > $dir/train.gz +fi + +# get validation data from CHiME3 dev set +touch $dir/valid.gz +if [ `du -k $dir/valid.gz | cut -f 1` -eq 68 ]; then + echo "Not getting validation data again [already exists]"; +else + echo "Collecting validation data from $chime3_data/data/transcriptions"; + cut -d" " -f2- $chime3_data/data/transcriptions/dt05_real.trn_all \ + $chime3_data/data/transcriptions/dt05_simu.trn_all \ + |gzip -c > $dir/valid.gz +fi + +# train a large n-gram language model +lm_suffix=${order}gkn_5k +if [ -f $dir/lm_${lm_suffix}.arpa.gz ]; then + echo "A $order-gram language model aready exists and is not constructed again" + echo "To reconstruct, remove $dir/$dir/lm_${lm_suffix}.arpa.gz first" +else + echo "Training a $order-gram language model" + ngram-count -text $dir/train.gz -order $order \ + -vocab $dir/vocab_5k.txt -unk -map-unk "" \ + -gt2min 1 -gt3min 1 -gt4min 2 -gt5min 2 \ + -interpolate -kndiscount \ + -lm $dir/lm_${lm_suffix}.arpa.gz +fi +echo "Checking validation perplexity of $order-gram language model" +ngram -order $order -ppl $dir/valid.gz -lm $dir/lm_${lm_suffix}.arpa.gz +# e.g. 5-gram perplexity: +# file data/local/local_lm/valid.txt: 3280 sentences, 54239 words, 3 OOVs +# 0 zeroprobs, logprob= -96775.5 ppl= 48.1486 ppl1= 60.8611 + +# Next, create the corresponding FST and lang_test_* directory. 
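# (For context, an illustrative note: arpa2fst reads the ARPA text on stdin, maps
# words through the given symbol table, and writes G.fst as a weighted word
# acceptor; --disambig-symbol=#0 puts the #0 symbol on the backoff arcs so that
# the resulting G.fst stays determinizable when composed with the lexicon.
# A standalone sketch, assuming an existing data/lang directory:
#   gunzip -c lm.arpa.gz | arpa2fst --disambig-symbol=#0 \
#     --read-symbol-table=data/lang/words.txt - data/lang_test/G.fst )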
+echo "Preparing language models for test" +tmpdir=data/local/lm_tmp +lexicon=data/local/lang_tmp/lexiconp.txt +mkdir -p $tmpdir + +test=data/lang_test_${lm_suffix} +mkdir -p $test +for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ + phones; do + cp -r data/lang/$f $test +done +gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst +fstisstochastic $test/G.fst +# The output is like: +# 9.14233e-05 -0.259833 +# we do expect the first of these 2 numbers to be close to zero (the second is +# nonzero because the backoff weights make the states sum to >1). +# Because of the fiasco for these particular LMs, the first number is not +# as close to zero as it could be. + +# Everything below is only for diagnostic. +# Checking that G has no cycles with empty words on them (e.g. , ); +# this might cause determinization failure of CLG. +# #0 is treated as an empty word. +mkdir -p $tmpdir/g +awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ + < "$lexicon" >$tmpdir/g/select_empty.fst.txt +fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ + fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst +fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + echo "Language model has cycles with empty words" && exit 1 +rm -r $tmpdir/g + +echo "Succeeded in preparing a large ${order}-gram LM" +rm -r $tmpdir diff --git a/egs/chime3/s5/local/chime3_train_rnnlms.sh b/egs/chime3/s5/local/chime3_train_rnnlms.sh new file mode 100755 index 00000000000..429ca828aa3 --- /dev/null +++ b/egs/chime3/s5/local/chime3_train_rnnlms.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +# Config: +hidden=300 # Num-hidden units +class=200 # Num-classes +rnnlm_ver=rnnlm-0.3e # version of RNNLM to use +threads=1 # for RNNLM-HS +bptt=4 # length of BPTT unfolding in RNNLM +bptt_block=10 # length of BPTT unfolding in RNNLM + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specifies a CHiME3 root directory" + echo "If you use kaldi scripts distributed in the CHiME3 data," + echo "It would be `pwd`/../.." + exit 1; +fi + +# check data directories +chime3_data=$1 +wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime3 data root correctly" && exit 1 +fi +if [ ! -d $wsj0_data ]; then + echo "$wsj0_data does not exist. 
Please specify WSJ0 corpus directory" && exit 1 +fi +lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data + +# lm directories +dir=data/local/local_lm +srcdir=data/local/nist_lm +mkdir -p $dir + +# extract 5k vocabulary from a baseline language model +srclm=$srcdir/lm_tgpr_5k.arpa.gz +if [ -f $srclm ]; then + echo "Getting vocabulary from a baseline language model"; + gunzip -c $srclm | awk 'BEGIN{unig=0}{ + if(unig==0){ + if($1=="\\1-grams:"){unig=1}} + else { + if ($1 != "") { + if ($1=="\\2-grams:" || $1=="\\end\\") {exit} + else {print $2}} + }}' | sed "s///" > $dir/vocab_5k.rnn +else + echo "Language model $srclm does not exist" && exit 1; +fi + +# collect training data from WSJ0 +touch $dir/train.rnn +if [ `du -m $dir/train.rnn | cut -f 1` -eq 223 ]; then + echo "Not getting training data again [already exists]"; +else + echo "Collecting training data from $lm_train"; + gunzip -c $lm_train/{87,88,89}/*.z \ + | awk -v voc=$dir/vocab_5k.rnn ' + BEGIN{ while((getline0) { invoc[$1]=1; }} + /^ "); } + } + printf("\n"); + }' > $dir/train.rnn +fi + +# get validation data from CHiME3 dev set +touch $dir/valid.rnn +if [ `cat $dir/valid.rnn | wc -w` -eq 54239 ]; then + echo "Not getting validation data again [already exists]"; +else + echo "Collecting validation data from $chime3_data/data/transcriptions"; + cut -d" " -f2- $chime3_data/data/transcriptions/dt05_real.trn_all \ + $chime3_data/data/transcriptions/dt05_simu.trn_all \ + > $dir/valid.rnn +fi + +# RNN language model traing +$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 + +# train a RNN language model +rnnmodel=$dir/rnnlm_5k_h${hidden}_bptt${bptt} +if [ -f $rnnmodel ]; then + echo "A RNN language model aready exists and is not constructed again" + echo "To reconstruct, remove $rnnmodel first" +else + echo "Training a RNN language model with $rnnlm_ver" + echo "(runtime log is written to $dir/rnnlm.log)" + $train_cmd $dir/rnnlm.log \ + $KALDI_ROOT/tools/$rnnlm_ver/rnnlm -train $dir/train.rnn -valid $dir/valid.rnn \ + -rnnlm $rnnmodel -hidden $hidden -class $class \ + -rand-seed 1 -independent -debug 1 -bptt $bptt -bptt-block $bptt_block || exit 1; +fi + +# store in a RNNLM directory with necessary files +rnndir=data/lang_test_rnnlm_5k_h${hidden} +mkdir -p $rnndir +cp $rnnmodel $rnndir/rnnlm +grep -v -e "" -e "" $dir/vocab_5k.rnn > $rnndir/wordlist.rnn +touch $rnndir/unk.probs # make an empty file because we don't know unk-word probs. + diff --git a/egs/chime3/s5/local/clean_chime3_format_data.sh b/egs/chime3/s5/local/clean_chime3_format_data.sh index d3a2c73471c..f2d81bc5324 100755 --- a/egs/chime3/s5/local/clean_chime3_format_data.sh +++ b/egs/chime3/s5/local/clean_chime3_format_data.sh @@ -20,7 +20,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/lang_tmp/lexiconp.txt mkdir -p $tmpdir -for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do +for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.txt data/$x/text || exit 1; @@ -43,29 +43,15 @@ for lm_suffix in tgpr_5k; do cp -r data/lang/$f $test done gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. 
These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. + # The output is like: + # 9.14233e-05 -0.259833 + # we do expect the first of these 2 numbers to be close to zero (the second is + # nonzero because the backoff weights make the states sum to >1). + # Because of the fiasco for these particular LMs, the first number is not + # as close to zero as it could be. # Everything below is only for diagnostic. # Checking that G has no cycles with empty words on them (e.g. , ); @@ -76,7 +62,7 @@ for lm_suffix in tgpr_5k; do < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g done diff --git a/egs/chime3/s5/local/run_dnn.sh b/egs/chime3/s5/local/run_dnn.sh index 1795983ce17..668236dd341 100755 --- a/egs/chime3/s5/local/run_dnn.sh +++ b/egs/chime3/s5/local/run_dnn.sh @@ -12,6 +12,12 @@ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. +# Config: +nj=30 +stage=0 # resume training with --stage=N + +. utils/parse_options.sh || exit 1; + # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. @@ -22,12 +28,16 @@ if [ $# -ne 2 ]; then exit 1; fi -nj=30 - -# enhan data +# set enhanced data enhan=$1 enhan_data=$2 +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + # check whether run_init is executed if [ ! -d data/lang ]; then echo "error, execute local/run_init.sh, first" @@ -40,53 +50,77 @@ if [ ! 
-d exp/tri3b_tr05_multi_$enhan ]; then exit 1; fi -# make 40-dim fbank features for enhan data -fbankdir=fbank/$enhan -mkdir -p data-fbank -for x in dt05_real_$enhan et05_real_$enhan tr05_real_$enhan dt05_simu_$enhan et05_simu_$enhan tr05_simu_$enhan; do - cp -r data/$x data-fbank - steps/make_fbank.sh --nj $nj \ - data-fbank/$x exp/make_fbank/$x $fbankdir || exit 1; -done +# get alignments +if [ $stage -le 0 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_$enhan data/lang exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_${enhan}_ali + steps/align_fmllr.sh --nj 4 --cmd "$train_cmd" \ + data/dt05_multi_$enhan data/lang exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_${enhan}_ali_dt05 +fi -# make mixed training set from real and simulation enhancement training data -# multi = simu + real -utils/combine_data.sh data-fbank/tr05_multi_$enhan data-fbank/tr05_simu_$enhan data-fbank/tr05_real_$enhan -utils/combine_data.sh data-fbank/dt05_multi_$enhan data-fbank/dt05_simu_$enhan data-fbank/dt05_real_$enhan -utils/combine_data.sh data-fbank/et05_multi_$enhan data-fbank/et05_simu_$enhan data-fbank/et05_real_$enhan +# make fmllr feature for training multi = simu + real +gmmdir=exp/tri3b_tr05_multi_${enhan}_ali +data_fmllr=data-fmllr-tri3b +mkdir -p $data_fmllr +fmllrdir=fmllr-tri3b/$enhan +if [ $stage -le 1 ]; then + for x in tr05_real_$enhan tr05_simu_$enhan; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi + +# make fmllr feature for dev and eval +gmmdir=exp/tri3b_tr05_multi_${enhan} +if [ $stage -le 2 ]; then + for x in dt05_real_$enhan et05_real_$enhan dt05_simu_$enhan et05_simu_$enhan; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_tgpr_5k_$x \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi -# get alignment -steps/align_fmllr.sh --nj $nj \ - data/tr05_multi_$enhan data/lang exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_${enhan}_ali || exit 1; -steps/align_fmllr.sh --nj 4 \ - data/dt05_multi_$enhan data/lang exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_${enhan}_ali_dt05 || exit 1; +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 3 ]; then + utils/combine_data.sh $data_fmllr/tr05_multi_$enhan $data_fmllr/tr05_simu_$enhan $data_fmllr/tr05_real_$enhan + utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan + utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan +fi # pre-train dnn dir=exp/tri4a_dnn_pretrain_tr05_multi_$enhan -$cuda_cmd $dir/_pretrain_dbn.log \ - steps/nnet/pretrain_dbn.sh --nn-depth 7 --rbm-iter 3 data-fbank/tr05_multi_$enhan $dir +if [ $stage -le 4 ]; then + $cuda_cmd $dir/_pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh --nn-depth 7 --rbm-iter 3 $data_fmllr/tr05_multi_$enhan $dir +fi # train dnn dir=exp/tri4a_dnn_tr05_multi_$enhan ali=exp/tri3b_tr05_multi_${enhan}_ali -ali_dev=exp/tri3b_tr05_multi_${enhan}_ali_dt05 +ali_dev=exp/tri3b_tr05_multi_${enhan}_ali_dt05 feature_transform=exp/tri4a_dnn_pretrain_tr05_multi_$enhan/final.feature_transform dbn=exp/tri4a_dnn_pretrain_tr05_multi_$enhan/7.dbn -$cuda_cmd $dir/_train_nnet.log \ -steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ -data-fbank/tr05_multi_$enhan 
data-fbank/dt05_multi_$enhan data/lang $ali $ali_dev $dir || exit 1; - -# decode enhan speech -utils/mkgraph.sh data/lang_test_tgpr_5k $dir $dir/graph_tgpr_5k || exit 1; -steps/nnet/decode.sh --nj 4 --num-threads 4 --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k data-fbank/dt05_real_$enhan $dir/decode_tgpr_5k_dt05_real_$enhan & -steps/nnet/decode.sh --nj 4 --num-threads 4 --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k data-fbank/dt05_simu_$enhan $dir/decode_tgpr_5k_dt05_simu_$enhan & -steps/nnet/decode.sh --nj 4 --num-threads 4 --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k data-fbank/et05_real_$enhan $dir/decode_tgpr_5k_et05_real_$enhan & -steps/nnet/decode.sh --nj 4 --num-threads 4 --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k data-fbank/et05_simu_$enhan $dir/decode_tgpr_5k_et05_simu_$enhan & -wait; +if [ $stage -le 5 ]; then + $cuda_cmd $dir/_train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + $data_fmllr/tr05_multi_$enhan $data_fmllr/dt05_multi_$enhan data/lang $ali $ali_dev $dir +fi + +# decode enhanced speech +if [ $stage -le 6 ]; then + utils/mkgraph.sh data/lang_test_tgpr_5k $dir $dir/graph_tgpr_5k + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/dt05_real_$enhan $dir/decode_tgpr_5k_dt05_real_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/dt05_simu_$enhan $dir/decode_tgpr_5k_dt05_simu_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/et05_real_$enhan $dir/decode_tgpr_5k_et05_real_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/et05_simu_$enhan $dir/decode_tgpr_5k_et05_simu_$enhan & + wait; +fi # Sequence training using sMBR criterion, we do Stochastic-GD # with per-utterance updates. 
We use usually good acwt 0.1 @@ -96,32 +130,38 @@ srcdir=exp/tri4a_dnn_tr05_multi_${enhan} acwt=0.1 # First we generate lattices and alignments: -# gawk musb be installed to perform awk -v FS="/" '{ print gensub(".gz","","",$NF)" gunzip -c "$0" |"; }' in +# gawk must be installed to perform awk -v FS="/" '{ print gensub(".gz","","",$NF)" gunzip -c "$0" |"; }' in # steps/nnet/make_denlats.sh -steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali -steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_denlats +if [ $stage -le 7 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_denlats +fi # Re-train the DNN by 1 iteration of sMBR -steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir +if [ $stage -le 8 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir +fi # Decode (reuse HCLG graph) -for ITER in 1; do - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & -done +if [ $stage -le 9 ]; then + for ITER in 1; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet 
$dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + done +fi # Re-generate lattices, run 4 more sMBR iterations dir=exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats @@ -129,37 +169,47 @@ srcdir=exp/tri4a_dnn_tr05_multi_${enhan}_smbr acwt=0.1 # Generate lattices and alignments: -steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali -steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_denlats +if [ $stage -le 10 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_denlats +fi # Re-train the DNN by 4 iterations of sMBR -steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 +if [ $stage -le 11 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 +fi # Decode (reuse HCLG graph) -for ITER in 1 2 3 4; do - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & -done -wait - -# decoded results of enhan speech using enhan DNN AMs -local/chime3_calc_wers.sh exp/tri4a_dnn_tr05_multi_$enhan $enhan > exp/tri4a_dnn_tr05_multi_$enhan/best_wer_$enhan.result -head -n 15 exp/tri4a_dnn_tr05_multi_$enhan/best_wer_$enhan.result -# decoded results of enhan speech using enhan DNN AMs with sequence training -./local/chime3_calc_wers_smbr.sh exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats ${enhan} exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k \ +if [ $stage -le 12 ]; then + for ITER in 1 2 3 4; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + 
exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + done + wait +fi + +# scoring +if [ $stage -le 13 ]; then + # decoded results of enhanced speech using DNN AMs trained with enhanced data + local/chime3_calc_wers.sh exp/tri4a_dnn_tr05_multi_$enhan $enhan > exp/tri4a_dnn_tr05_multi_$enhan/best_wer_$enhan.result + head -n 15 exp/tri4a_dnn_tr05_multi_$enhan/best_wer_$enhan.result + # decoded results of enhanced speech using sequence-training DNN + ./local/chime3_calc_wers_smbr.sh exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats ${enhan} exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k \ > exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result -head -n 15 exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result + head -n 15 exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result +fi +echo "`basename $0` Done." diff --git a/egs/chime3/s5/local/run_gmm.sh b/egs/chime3/s5/local/run_gmm.sh index 9ba4dadc14c..5b9fbaa1736 100755 --- a/egs/chime3/s5/local/run_gmm.sh +++ b/egs/chime3/s5/local/run_gmm.sh @@ -12,6 +12,12 @@ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. +# Config: +nj=30 +stage=0 # resume training with --stage=N + +. utils/parse_options.sh || exit 1; + # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. @@ -22,87 +28,115 @@ if [ $# -ne 2 ]; then exit 1; fi -nj=30 - -# enhan data +# set enhanced data enhan=$1 enhan_data=$2 +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + # check whether run_init is executed if [ ! -d data/lang ]; then echo "error, execute local/run_init.sh, first" exit 1; fi -# process for enhan data -local/real_enhan_chime3_data_prep.sh $enhan $enhan_data || exit 1; -local/simu_enhan_chime3_data_prep.sh $enhan $enhan_data || exit 1; +# process for enhanced data +if [ $stage -le 0 ]; then + local/real_enhan_chime3_data_prep.sh $enhan $enhan_data + local/simu_enhan_chime3_data_prep.sh $enhan $enhan_data +fi # Now make MFCC features for clean, close, and noisy data # mfccdir should be some place with a largish disk where you # want to store MFCC features. 
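# (An illustrative aside, not part of the recipe: on a shared machine this could
# point at scratch space instead, e.g.
#   mfccdir=/export/scratch/$USER/chime3_mfcc_$enhan
# as long as the same directory is passed to make_mfcc.sh and
# compute_cmvn_stats.sh in the loop below.)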
mfccdir=mfcc/$enhan -for x in dt05_real_$enhan et05_real_$enhan tr05_real_$enhan dt05_simu_$enhan et05_simu_$enhan tr05_simu_$enhan; do - steps/make_mfcc.sh --nj $nj \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; -done +if [ $stage -le 1 ]; then + for x in dt05_real_$enhan et05_real_$enhan tr05_real_$enhan dt05_simu_$enhan et05_simu_$enhan tr05_simu_$enhan; do + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi -# make mixed training set from real and simulation enhancement training data +# make mixed training set from real and simulation enhanced data # multi = simu + real -utils/combine_data.sh data/tr05_multi_$enhan data/tr05_simu_$enhan data/tr05_real_$enhan -utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan -utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan - -# decode enhan speech using clean AMs -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_dt05_real_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_dt05_simu_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_et05_real_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_et05_simu_$enhan & - -# training models using enhan data -steps/train_mono.sh --boost-silence 1.25 --nj $nj \ - data/tr05_multi_$enhan data/lang exp/mono0a_tr05_multi_$enhan || exit 1; - -steps/align_si.sh --boost-silence 1.25 --nj $nj \ - data/tr05_multi_$enhan data/lang exp/mono0a_tr05_multi_$enhan exp/mono0a_ali_tr05_multi_$enhan || exit 1; - -steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/tr05_multi_$enhan data/lang exp/mono0a_ali_tr05_multi_$enhan exp/tri1_tr05_multi_$enhan || exit 1; - -steps/align_si.sh --nj $nj \ - data/tr05_multi_$enhan data/lang exp/tri1_tr05_multi_$enhan exp/tri1_ali_tr05_multi_$enhan || exit 1; - -steps/train_lda_mllt.sh \ - --splice-opts "--left-context=3 --right-context=3" \ - 2500 15000 data/tr05_multi_$enhan data/lang exp/tri1_ali_tr05_multi_$enhan exp/tri2b_tr05_multi_$enhan || exit 1; - -steps/align_si.sh --nj $nj \ - --use-graphs true data/tr05_multi_$enhan data/lang exp/tri2b_tr05_multi_$enhan exp/tri2b_ali_tr05_multi_$enhan || exit 1; - -steps/train_sat.sh \ - 2500 15000 data/tr05_multi_$enhan data/lang exp/tri2b_ali_tr05_multi_$enhan exp/tri3b_tr05_multi_$enhan || exit 1; - -utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k || exit 1; - -# decode enhan speech using enhan AMs -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_dt05_real_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_dt05_simu_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/et05_real_$enhan 
exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_et05_real_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_et05_simu_$enhan & - -wait; -# decoded results of enhan speech using clean AMs -local/chime3_calc_wers.sh exp/tri3b_tr05_orig_clean $enhan > exp/tri3b_tr05_orig_clean/best_wer_$enhan.result -head -n 15 exp/tri3b_tr05_orig_clean/best_wer_$enhan.result -# decoded results of enhan speech using enhan AMs -local/chime3_calc_wers.sh exp/tri3b_tr05_multi_$enhan $enhan > exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result -head -n 15 exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result +if [ $stage -le 2 ]; then + utils/combine_data.sh data/tr05_multi_$enhan data/tr05_simu_$enhan data/tr05_real_$enhan + utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan + utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan +fi + +# decode enhanced speech using clean AMs +if [ $stage -le 3 ]; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_dt05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_dt05_simu_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_et05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_et05_simu_$enhan & +fi + +# training models using enhanced data +# training monophone model +if [ $stage -le 4 ]; then + steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_$enhan data/lang exp/mono0a_tr05_multi_$enhan + + steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_$enhan data/lang exp/mono0a_tr05_multi_$enhan exp/mono0a_ali_tr05_multi_$enhan +fi + +# training triphone model with delta and delta-delta features +if [ $stage -le 5 ]; then + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/tr05_multi_$enhan data/lang exp/mono0a_ali_tr05_multi_$enhan exp/tri1_tr05_multi_$enhan + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_$enhan data/lang exp/tri1_tr05_multi_$enhan exp/tri1_ali_tr05_multi_$enhan +fi + +# training triphone model with LDA+MLLT features +if [ $stage -le 6 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/tr05_multi_$enhan data/lang exp/tri1_ali_tr05_multi_$enhan exp/tri2b_tr05_multi_$enhan + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + --use-graphs true data/tr05_multi_$enhan data/lang exp/tri2b_tr05_multi_$enhan exp/tri2b_ali_tr05_multi_$enhan +fi + +# training triphone model with SAT +if [ $stage -le 7 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/tr05_multi_$enhan data/lang exp/tri2b_ali_tr05_multi_$enhan exp/tri3b_tr05_multi_$enhan + utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k +fi + +# decode enhanced speech using AMs trained with enhanced data +if [ $stage -le 8 ]; then + 
steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_dt05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_dt05_simu_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_et05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_et05_simu_$enhan & + wait; +fi + +# scoring +if [ $stage -le 9 ]; then + # decoded results of enhanced speech using clean AMs + local/chime3_calc_wers.sh exp/tri3b_tr05_orig_clean $enhan > exp/tri3b_tr05_orig_clean/best_wer_$enhan.result + head -n 15 exp/tri3b_tr05_orig_clean/best_wer_$enhan.result + # decoded results of enhanced speech using AMs trained with enhanced data + local/chime3_calc_wers.sh exp/tri3b_tr05_multi_$enhan $enhan > exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result + head -n 15 exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result +fi + +echo "`basename $0` Done." diff --git a/egs/chime3/s5/local/run_init.sh b/egs/chime3/s5/local/run_init.sh index 2f923298e38..9db289a12a5 100755 --- a/egs/chime3/s5/local/run_init.sh +++ b/egs/chime3/s5/local/run_init.sh @@ -5,6 +5,12 @@ # Mitsubishi Electric Research Labs (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Config: +nj=30 +stage=0 # resume training with --stage=N + +. utils/parse_options.sh || exit 1; + # This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 # made by Chao Weng @@ -23,32 +29,38 @@ fi # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. -nj=30 # clean data chime3_data=$1 wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory eval_flag=true # make it true when the evaluation data are released -# process for clean speech and making LMs etc. from original WSJ0 -# note that training on clean data means original WSJ0 data only (no booth data) -local/clean_wsj0_data_prep.sh $wsj0_data || exit 1; - -local/wsj_prepare_dict.sh || exit 1; - -utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang || exit 1; - -local/clean_chime3_format_data.sh || exit 1; +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +if [ $stage -le 0 ]; then + # process for clean speech and making LMs etc. 
from original WSJ0 + # note that training on clean data means original WSJ0 data only (no booth data) + local/clean_wsj0_data_prep.sh $wsj0_data + local/wsj_prepare_dict.sh + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang + local/clean_chime3_format_data.sh +fi -# process for close talking speech for real data (will not be used) -local/real_close_chime3_data_prep.sh $chime3_data || exit 1; +if [ $stage -le 1 ]; then + # process for close talking speech for real data (will not be used) + # local/real_close_chime3_data_prep.sh $chime3_data -# process for booth recording speech (will not be used) -# local/bth_chime3_data_prep.sh $chime3_data || exit 1; + # process for booth recording speech (will not be used) + # local/bth_chime3_data_prep.sh $chime3_data -# process for distant talking speech for real and simulation data -local/real_noisy_chime3_data_prep.sh $chime3_data || exit 1; -local/simu_noisy_chime3_data_prep.sh $chime3_data || exit 1; + # process for distant talking speech for real and simulation data + local/real_noisy_chime3_data_prep.sh $chime3_data + local/simu_noisy_chime3_data_prep.sh $chime3_data +fi # Now make MFCC features for clean, close, and noisy data # mfccdir should be some place with a largish disk where you @@ -72,74 +84,88 @@ else list=$list" tr05_simu_noisy dt05_simu_noisy" fi mfccdir=mfcc -for x in $list; do - steps/make_mfcc.sh --nj 8 \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; -done +if [ $stage -le 2 ]; then + for x in $list; do + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi # make mixed training set from real and simulation training data # multi = simu + real -utils/combine_data.sh data/tr05_multi_noisy data/tr05_simu_noisy data/tr05_real_noisy -utils/combine_data.sh data/dt05_multi_noisy data/dt05_simu_noisy data/dt05_real_noisy +if [ $stage -le 3 ]; then + utils/combine_data.sh data/tr05_multi_noisy data/tr05_simu_noisy data/tr05_real_noisy + utils/combine_data.sh data/dt05_multi_noisy data/dt05_simu_noisy data/dt05_real_noisy +fi # training models for clean and noisy data # if you want to check the performance of the ASR only using real/simu data # please try to add "tr05_real_noisy" "tr05_simu_noisy" -#for train in tr05_multi_noisy tr05_real_noisy tr05_simu_noisy tr05_orig_clean; do -for train in tr05_multi_noisy tr05_orig_clean; do - nspk=`wc -l data/$train/spk2utt | awk '{print $1}'` - if [ $nj -gt $nspk ]; then - nj2=$nspk - else - nj2=$nj - fi - steps/train_mono.sh --boost-silence 1.25 --nj $nj2 \ - data/$train data/lang exp/mono0a_$train || exit 1; - - steps/align_si.sh --boost-silence 1.25 --nj $nj2 \ - data/$train data/lang exp/mono0a_$train exp/mono0a_ali_$train || exit 1; - - steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/$train data/lang exp/mono0a_ali_$train exp/tri1_$train || exit 1; - - steps/align_si.sh --nj $nj2 \ - data/$train data/lang exp/tri1_$train exp/tri1_ali_$train || exit 1; - - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=3 --right-context=3" \ - 2500 15000 data/$train data/lang exp/tri1_ali_$train exp/tri2b_$train || exit 1; - - steps/align_si.sh --nj $nj2 \ - --use-graphs true data/$train data/lang exp/tri2b_$train exp/tri2b_ali_$train || exit 1; - - steps/train_sat.sh \ - 2500 15000 data/$train data/lang exp/tri2b_ali_$train exp/tri3b_$train || exit 1; - - utils/mkgraph.sh 
data/lang_test_tgpr_5k exp/tri3b_$train exp/tri3b_$train/graph_tgpr_5k || exit 1; - - # if you want to know the result of the close talk microphone, plese try the following - # decode close speech - # steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - # exp/tri3b_$train/graph_tgpr_5k data/dt05_real_close exp/tri3b_$train/decode_tgpr_5k_dt05_real_close & - # steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - # exp/tri3b_$train/graph_tgpr_5k data/et05_real_close exp/tri3b_$train/decode_tgpr_5k_et05_real_close & - # decode real noisy speech - steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_$train/graph_tgpr_5k data/dt05_real_noisy exp/tri3b_$train/decode_tgpr_5k_dt05_real_noisy & - steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_$train/graph_tgpr_5k data/et05_real_noisy exp/tri3b_$train/decode_tgpr_5k_et05_real_noisy & - # decode simu noisy speech - steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_$train/graph_tgpr_5k data/dt05_simu_noisy exp/tri3b_$train/decode_tgpr_5k_dt05_simu_noisy & - steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_$train/graph_tgpr_5k data/et05_simu_noisy exp/tri3b_$train/decode_tgpr_5k_et05_simu_noisy & -done -wait +# for train in tr05_multi_noisy tr05_real_noisy tr05_simu_noisy tr05_orig_clean; do +if [ $stage -le 4 ]; then + for train in tr05_multi_noisy tr05_orig_clean; do + nspk=`wc -l data/$train/spk2utt | awk '{print $1}'` + if [ $nj -gt $nspk ]; then + nj2=$nspk + else + nj2=$nj + fi + # training monophone model + steps/train_mono.sh --boost-silence 1.25 --nj $nj2 --cmd "$train_cmd" \ + data/$train data/lang exp/mono0a_$train + steps/align_si.sh --boost-silence 1.25 --nj $nj2 --cmd "$train_cmd" \ + data/$train data/lang exp/mono0a_$train exp/mono0a_ali_$train + + # training triphone models: delta features, then LDA+MLLT, then SAT + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/$train data/lang exp/mono0a_ali_$train exp/tri1_$train + steps/align_si.sh --nj $nj2 --cmd "$train_cmd" \ + data/$train data/lang exp/tri1_$train exp/tri1_ali_$train + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/$train data/lang exp/tri1_ali_$train exp/tri2b_$train + steps/align_si.sh --nj $nj2 --cmd "$train_cmd" \ + --use-graphs true data/$train data/lang exp/tri2b_$train exp/tri2b_ali_$train + + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/$train data/lang exp/tri2b_ali_$train exp/tri3b_$train + utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b_$train exp/tri3b_$train/graph_tgpr_5k + done +fi + +# decoding +if [ $stage -le 5 ]; then + for train in tr05_multi_noisy tr05_orig_clean; do + # if you want to know the result of the close talk microphone, please try the following + # decode close speech + # steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + # exp/tri3b_$train/graph_tgpr_5k data/dt05_real_close exp/tri3b_$train/decode_tgpr_5k_dt05_real_close & + # steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + # exp/tri3b_$train/graph_tgpr_5k data/et05_real_close exp/tri3b_$train/decode_tgpr_5k_et05_real_close & + + # decode real noisy speech + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_$train/graph_tgpr_5k data/dt05_real_noisy exp/tri3b_$train/decode_tgpr_5k_dt05_real_noisy & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_$train/graph_tgpr_5k data/et05_real_noisy exp/tri3b_$train/decode_tgpr_5k_et05_real_noisy & + # decode simu noisy speech + 
steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_$train/graph_tgpr_5k data/dt05_simu_noisy exp/tri3b_$train/decode_tgpr_5k_dt05_simu_noisy & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_$train/graph_tgpr_5k data/et05_simu_noisy exp/tri3b_$train/decode_tgpr_5k_et05_simu_noisy & + done + wait +fi # get the best scores -#for train in tr05_multi_noisy tr05_real_noisy tr05_simu_noisy tr05_orig_clean; do -for train in tr05_multi_noisy tr05_orig_clean; do - local/chime3_calc_wers.sh exp/tri3b_$train noisy > exp/tri3b_$train/best_wer_noisy.result - head -n 15 exp/tri3b_$train/best_wer_noisy.result -done +if [ $stage -le 6 ]; then + #for train in tr05_multi_noisy tr05_real_noisy tr05_simu_noisy tr05_orig_clean; do + for train in tr05_multi_noisy tr05_orig_clean; do + local/chime3_calc_wers.sh exp/tri3b_$train noisy > exp/tri3b_$train/best_wer_noisy.result + head -n 15 exp/tri3b_$train/best_wer_noisy.result + done +fi + +echo "`basename $0` Done." diff --git a/egs/chime3/s5/local/run_lmrescore.sh b/egs/chime3/s5/local/run_lmrescore.sh new file mode 100755 index 00000000000..0c364367c98 --- /dev/null +++ b/egs/chime3/s5/local/run_lmrescore.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +nj=12 +stage=1 +order=5 +hidden=300 +rnnweight=0.5 +nbest=100 + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a root directory of CHiME3 data" + echo "Second argument specifies a unique name for different enhancement method" + exit 1; +fi + +# set language models +lm_suffix=${order}gkn_5k +rnnlm_suffix=rnnlm_5k_h${hidden} + +# data root +chime3_data=$1 +# enhan data +enhan=$2 + +# check data +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime3 data root correctly" && exit 1 +fi + +# check whether run_dnn is executed +srcdir=exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats +if [ ! -d $srcdir ]; then + echo "error, execute local/run_dnn.sh, first" + exit 1; +fi + +# train a high-order n-gram language model +if [ $stage -le 1 ]; then + local/chime3_train_lms.sh $chime3_data || exit 1; +fi + +# train a RNN language model +if [ $stage -le 2 ]; then + local/chime3_train_rnnlms.sh $chime3_data || exit 1; +fi + +# preparation +dir=exp/tri4a_dnn_tr05_multi_${enhan}_smbr_lmrescore +mkdir -p $dir +# make a symbolic link to graph info +if [ ! -e $dir/graph_tgpr_5k ]; then + if [ ! -e exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k ]; then + echo "graph is missing, execute local/run_dnn.sh, correctly" + exit 1; + fi + pushd . ; cd $dir + ln -s ../tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k . + popd +fi + +# rescore lattices by a high-order N-gram +if [ $stage -le 3 ]; then + # check the best iteration + if [ ! 
-f $srcdir/log/best_wer_$enhan ]; then + echo "error, execute local/run_dnn.sh, first" + exit 1; + fi + it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` + # rescore lattices + for t in dt05_simu dt05_real et05_simu et05_real; do + steps/lmrescore.sh --mode 3 \ + data/lang_test_tgpr_5k \ + data/lang_test_${lm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $srcdir/decode_tgpr_5k_${t}_${enhan}_it$it \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} + done + # rescored results by high-order n-gram LM + mkdir -p $dir/log + local/chime3_calc_wers.sh $dir ${enhan}_${lm_suffix} \ + > $dir/best_wer_${enhan}_${lm_suffix}.result + head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result +fi + +# N-best rescoring using a RNNLM +if [ $stage -le 4 ]; then + # check the best lmw + if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then + echo "error, rescoring with a high-order n-gram seems to be failed" + exit 1; + fi + lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` + # rescore n-best list for all sets + for t in dt05_simu dt05_real et05_simu et05_real; do + steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ + $rnnweight \ + data/lang_test_${lm_suffix} \ + data/lang_test_${rnnlm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ + $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} + done + # calc wers for RNNLM results + local/chime3_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} \ + > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result + head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result +fi + diff --git a/egs/chime3/s5/path.sh b/egs/chime3/s5/path.sh index fc9eaf0192e..a4772b7d89d 100755 --- a/egs/chime3/s5/path.sh +++ b/egs/chime3/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/tools/kaldi_lm/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/tools/kaldi_lm/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/chime3/s5/run.sh b/egs/chime3/s5/run.sh old mode 100644 new mode 100755 index a934055ab0b..f7cc389f37a --- a/egs/chime3/s5/run.sh +++ b/egs/chime3/s5/run.sh @@ -8,27 +8,67 @@ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) . ./path.sh -. ./cmd.sh +. ./cmd.sh + +# Config: +stage=0 # resume training with --stage=N + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail # You can execute run_init.sh only "once" -# This creates LMs, basic task files, basic models, +# This creates LMs, basic task files, basic models, # baseline results without speech enhancement techniques, and so on. 
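# All of the steps below are gated on $stage, so a partially finished run can be resumed
# without redoing earlier work; for example, ./run.sh --stage 2 restarts from the GMM
# experiments and skips the initialization and beamforming stages.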
# Please set a main root directory of the CHiME3 data -# If you use kaldi scripts distributed in the CHiME3 data, -chime3_data=`pwd`/../.. -# Otherwise, please specify it, e.g., -# chime3_data=/local_data/watanabe/work/201410CHiME3/CHiME3 -local/run_init.sh $chime3_data +# If you use kaldi scripts distributed in the CHiME3 data, +# chime3_data=`pwd`/../.. +# Otherwise, please specify it, e.g., +chime3_data=/data2/archive/speech-db/original/public/CHiME3 +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime3 data root correctly" && exit 1 +fi +if [ $stage -le 0 ]; then + local/run_init.sh $chime3_data +fi + +# Using Beamformit +# This results in better performance than the CHiME3 official beamforming +# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 +# note that beamformed wav files are generated in the following directory +enhancement_method=beamformit_5mics +enhancement_data=`pwd`/$enhancement_method +if [ $stage -le 1 ]; then + local/chime3_beamform.sh --cmd "$train_cmd" --nj 20 $chime3_data/data/audio/16kHz/isolated $enhancement_data +fi # GMM based ASR experiment # Please set a directory of your speech enhancement method. # run_gmm.sh can be done every time when you change a speech enhancement technique. # The directory structure and audio files must follow the attached baseline enhancement directory -enhancement_method=enhanced -enhancement_data=$chime3_data/data/audio/16kHz/enhanced -local/run_gmm.sh $enhancement_method $enhancement_data +# if you want to use the CHiME3 official enhanced data, please comment out the following +# enhancement_method=enhanced +# enhancement_data=$chime3_data/data/audio/16kHz/enhanced +if [ $stage -le 2 ]; then + local/run_gmm.sh $enhancement_method $enhancement_data +fi # DNN based ASR experiment # Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. # You may execute it after you would have promising results using GMM-based ASR experiments -local/run_dnn.sh $enhancement_method $enhancement_data \ No newline at end of file +if [ $stage -le 3 ]; then + local/run_dnn.sh $enhancement_method $enhancement_data +fi + +# LM-rescoring experiment with 5-gram and RNN LMs +# It takes a few days to train a RNNLM. +if [ $stage -le 4 ]; then + local/run_lmrescore.sh $chime3_data $enhancement_method +fi + +echo "Done." diff --git a/egs/chime4/README.txt b/egs/chime4/README.txt new file mode 100644 index 00000000000..e1d8f35e3f6 --- /dev/null +++ b/egs/chime4/README.txt @@ -0,0 +1,11 @@ +This is a kaldi recipe for the 4th CHiME Speech Separation and Recognition Challenge (CHiME-4). The +challenge revisits the datasets originally recorded for CHiME-3, i.e., Wall Street Journal corpus sentences +spoken by talkers situated in challenging noisy environments recorded using a 6-channel tablet based +microphone array. CHiME-4 increases the level of difficulty by constraining the number of microphones +available for testing (i.e., separate 1, 2 and 6 channel tracks). + +See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information. 
+ + s5_1ch: 1 channel track + s5_2ch: 2 channel track + s5_6ch: 6 channel track diff --git a/egs/chime4/s5_1ch/RESULTS b/egs/chime4/s5_1ch/RESULTS new file mode 100644 index 00000000000..5654b6400ef --- /dev/null +++ b/egs/chime4/s5_1ch/RESULTS @@ -0,0 +1,47 @@ +# CHiME-4 1ch track results +# The result is based on Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15, +# and please refer the paper if you think the baseline useful. +# Note that the following result is different from that in the paper since we don't include +# SRI's robust features and system combination + +GMM noisy multi-condition without enhancement +exp/tri3b_tr05_multi_noisy/best_wer_isolated_1ch_track.result +------------------- +dt05_simu WER: 24.48% (Average), 20.37% (BUS), 29.78% (CAFE), 20.49% (PEDESTRIAN), 27.27% (STREET) +------------------- +dt05_real WER: 22.16% (Average), 27.32% (BUS), 23.07% (CAFE), 16.29% (PEDESTRIAN), 21.96% (STREET) +------------------- + +DNN sMBR +exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_isolated_1ch_track.result +------------------- +best overall dt05 WER 15.17% (language model weight = 11) + (Number of iterations = 4) +------------------- +dt05_simu WER: 15.67% (Average), 14.09% (BUS), 18.97% (CAFE), 12.76% (PEDESTRIAN), 16.89% (STREET) +------------------- +dt05_real WER: 14.67% (Average), 18.97% (BUS), 15.28% (CAFE), 9.88% (PEDESTRIAN), 14.56% (STREET) +------------------- + +5-gram rescoring +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_isolated_1ch_track_5gkn_5k.result +------------------- +best overall dt05 WER 13.46% (language model weight = 11) +------------------- +dt05_simu WER: 13.99% (Average), 13.02% (BUS), 16.76% (CAFE), 11.12% (PEDESTRIAN), 15.07% (STREET) +------------------- +dt05_real WER: 12.93% (Average), 16.89% (BUS), 13.48% (CAFE), 8.53% (PEDESTRIAN), 12.82% (STREET) +------------------- + +RNNLM +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_isolated_1ch_track_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 12.28% (language model weight = 11) +------------------- +dt05_simu WER: 12.98% (Average), 11.90% (BUS), 15.90% (CAFE), 9.94% (PEDESTRIAN), 14.19% (STREET) +------------------- +dt05_real WER: 11.57% (Average), 15.13% (BUS), 11.81% (CAFE), 7.42% (PEDESTRIAN), 11.90% (STREET) +------------------- + + diff --git a/egs/chime4/s5_1ch/cmd.sh b/egs/chime4/s5_1ch/cmd.sh new file mode 100755 index 00000000000..114fbff7a17 --- /dev/null +++ b/egs/chime4/s5_1ch/cmd.sh @@ -0,0 +1,22 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +#export train_cmd="queue.pl --mem 2G" +#export cuda_cmd="queue.pl --mem 2G --gpu 1" +#export decode_cmd="queue.pl --mem 4G" +#export mkgraph_cmd="queue.pl --mem 8G" + +# run it locally... +export train_cmd=run.pl +export decode_cmd=run.pl +export cuda_cmd=run.pl +export mkgraph_cmd=run.pl diff --git a/egs/chime4/s5_1ch/conf/chime4.cfg b/egs/chime4/s5_1ch/conf/chime4.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime4/s5_1ch/conf/chime4.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file (originally distributed for AMI data, http://groups.inf.ed.ac.uk/ami/download/), used here for CHiME-4 + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#maximum number of cross-correlation points taken into account +nbest_amount = 4 + +#flag whether to apply automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process the whole file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime4/s5_1ch/conf/decode_dnn.config b/egs/chime4/s5_1ch/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/chime4/s5_1ch/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/chime4/s5_1ch/conf/fbank.conf b/egs/chime4/s5_1ch/conf/fbank.conf new file mode 100644 index 00000000000..5fc7774b31f --- /dev/null +++ b/egs/chime4/s5_1ch/conf/fbank.conf @@ -0,0 +1,11 @@ +# Filterbank feature configuration. +--window-type=hamming # disable Dan's window, use the standard Hamming window +--use-energy=false # only fbank outputs +--sample-frequency=16000 # CHiME-4 audio is sampled at 16kHz + +--low-freq=64 # typical setup from Frantisek Grezl +--high-freq=8000 +--dither=1 + +--num-mel-bins=40 # 40 mel bins for 16kHz audio +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/timit/s3/conf/mfcc.conf b/egs/chime4/s5_1ch/conf/mfcc.conf similarity index 100% rename from egs/timit/s3/conf/mfcc.conf rename to egs/chime4/s5_1ch/conf/mfcc.conf diff --git a/egs/chime4/s5_1ch/local/chime4_calc_wers.sh b/egs/chime4/s5_1ch/local/chime4_calc_wers.sh new file mode 100755 index 00000000000..079668520f4 --- /dev/null +++ b/egs/chime4/s5_1ch/local/chime4_calc_wers.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# Copyright 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) +# Apache 2.0. + +set -e + +# Config: +eval_flag=false # make it true when the evaluation data are released + +. 
utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + printf "%s exp/tri3b_tr05_sr_noisy noisy exp/tri4a_dnn_tr05_sr_noisy/graph_tgpr_5k\n\n" `basename $0` + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +. path.sh + +dir=$1 +enhan=$2 +graph_dir=$3 + +echo "compute dt05 WER for each location" +echo "" +mkdir -p $dir/log +for a in `find $dir/decode_tgpr_5k_dt05_real_$enhan/ | grep "\/wer_" | awk -F'[/]' '{print $NF}' | sort`; do + echo -n "$a " + if [ -e $dir/decode_tgpr_5k_dt05_simu_$enhan ]; then + cat $dir/decode_tgpr_5k_dt05_{real,simu}_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' + else + cat $dir/decode_tgpr_5k_dt05_real_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' + fi +done | sort -n -k 2 | head -n 1 > $dir/log/best_wer_$enhan + +lmw=`cut -f 1 -d" " $dir/log/best_wer_$enhan | cut -f 2 -d"_"` +echo "-------------------" +printf "best overall dt05 WER %s" `cut -f 2 -d" " $dir/log/best_wer_$enhan` +echo -n "%" +printf " (language model weight = %s)\n" $lmw +echo "-------------------" +if $eval_flag; then + tasks="dt05 et05" +else + tasks="dt05" +fi +for e_d in $tasks; do + for task in simu real; do + rdir=$dir/decode_tgpr_5k_${e_d}_${task}_$enhan + if [ -e $rdir ]; then + for a in _BUS _CAF _PED _STR; do + grep $a $rdir/scoring/test_filt.txt \ + > $rdir/scoring/test_filt_$a.txt + cat $rdir/scoring/$lmw.tra \ + | utils/int2sym.pl -f 2- $graph_dir/words.txt \ + | sed s:\::g \ + | compute-wer --text --mode=present ark:$rdir/scoring/test_filt_$a.txt ark,p:- \ + 1> $rdir/${a}_wer_$lmw 2> /dev/null + done + echo -n "${e_d}_${task} WER: `grep WER $rdir/wer_$lmw | cut -f 2 -d" "`% (Average), " + echo -n "`grep WER $rdir/_BUS_wer_$lmw | cut -f 2 -d" "`% (BUS), " + echo -n "`grep WER $rdir/_CAF_wer_$lmw | cut -f 2 -d" "`% (CAFE), " + echo -n "`grep WER $rdir/_PED_wer_$lmw | cut -f 2 -d" "`% (PEDESTRIAN), " + echo -n "`grep WER $rdir/_STR_wer_$lmw | cut -f 2 -d" "`% (STREET)" + echo "" + echo "-------------------" + fi + done +done +echo "" + +for e_d in $tasks; do + echo "-----------------------------" + echo "1-best transcription for $e_d" + echo "-----------------------------" + for task in simu real; do + rdir=$dir/decode_tgpr_5k_${e_d}_${task}_$enhan + cat $rdir/scoring/$lmw.tra \ + | utils/int2sym.pl -f 2- $graph_dir/words.txt \ + | sed s:\::g + done +done diff --git a/egs/chime4/s5_1ch/local/chime4_calc_wers_smbr.sh b/egs/chime4/s5_1ch/local/chime4_calc_wers_smbr.sh new file mode 100755 index 00000000000..4990423a8a7 --- /dev/null +++ b/egs/chime4/s5_1ch/local/chime4_calc_wers_smbr.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) +# Apache 2.0. + +set -e + +# Config: +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + printf "%s exp/tri3b_tr05_sr_noisy noisy exp/tri4a_dnn_tr05_sr_noisy/graph_tgpr_5k\n\n" `basename $0` + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +. 
path.sh + +dir=$1 +enhan=$2 +graph_dir=$3 + +echo "compute WER for each location" +echo "" +mkdir -p $dir/log +# collect scores +for x in `find $dir/ -type d -name "*_it*" | awk -F "_it" '{print $NF}' | sort | uniq`; do + for y in `find $dir/*_${enhan}_it*/ | grep "\/wer_" | awk -F'[/]' '{print $NF}' | sort | uniq`; do + echo -n "${x}_$y " + cat $dir/decode_tgpr_5k_dt05_{real,simu}_${enhan}_it$x/$y | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' + done +done | sort -n -k 2 | head -n 1 > $dir/log/best_wer_$enhan + +lmw=`cut -f 1 -d" " $dir/log/best_wer_$enhan | awk -F'[_]' '{print $NF}'` +it=`cut -f 1 -d" " $dir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` +echo "-------------------" +printf "best overall dt05 WER %s" `cut -f 2 -d" " $dir/log/best_wer_$enhan` +echo -n "%" +printf " (language model weight = %s)\n" $lmw +printf " (Number of iterations = %s)\n" $it +echo "-------------------" +if $eval_flag; then + tasks="dt05 et05" +else + tasks="dt05" +fi +for e_d in $tasks; do + for task in simu real; do + rdir=$dir/decode_tgpr_5k_${e_d}_${task}_${enhan}_it$it + for a in _BUS _CAF _PED _STR; do + grep $a $rdir/scoring/test_filt.txt \ + > $rdir/scoring/test_filt_$a.txt + cat $rdir/scoring/$lmw.tra \ + | utils/int2sym.pl -f 2- $graph_dir/words.txt \ + | sed s:\::g \ + | compute-wer --text --mode=present ark:$rdir/scoring/test_filt_$a.txt ark,p:- \ + 1> $rdir/${a}_wer_$lmw 2> /dev/null + done + echo -n "${e_d}_${task} WER: `grep WER $rdir/wer_$lmw | cut -f 2 -d" "`% (Average), " + echo -n "`grep WER $rdir/_BUS_wer_$lmw | cut -f 2 -d" "`% (BUS), " + echo -n "`grep WER $rdir/_CAF_wer_$lmw | cut -f 2 -d" "`% (CAFE), " + echo -n "`grep WER $rdir/_PED_wer_$lmw | cut -f 2 -d" "`% (PEDESTRIAN), " + echo -n "`grep WER $rdir/_STR_wer_$lmw | cut -f 2 -d" "`% (STREET)" + echo "" + echo "-------------------" + done +done + +for e_d in $tasks; do + echo "-----------------------------" + echo "1-best transcription for $e_d" + echo "-----------------------------" + for task in simu real; do + rdir=$dir/decode_tgpr_5k_${e_d}_${task}_${enhan}_it$it + cat $rdir/scoring/$lmw.tra \ + | utils/int2sym.pl -f 2- $graph_dir/words.txt \ + | sed s:\::g + done +done diff --git a/egs/chime4/s5_1ch/local/chime4_train_lms.sh b/egs/chime4/s5_1ch/local/chime4_train_lms.sh new file mode 100755 index 00000000000..06dd716e789 --- /dev/null +++ b/egs/chime4/s5_1ch/local/chime4_train_lms.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# Modified from the script for CHiME3 baseline +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +# Config: +order=5 # n-gram order + +. utils/parse_options.sh || exit 1; + +. ./path.sh + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specifies a Chime4 root directory" + echo "If you use kaldi scripts distributed in the Chime4 data," + echo "It would be `pwd`/../.." + exit 1; +fi + +# check data directories +chime4_data=$1 +wsj0_data=$chime4_data/data/WSJ0 # directory of WSJ0 in Chime4. You can also specify your WSJ0 corpus directory +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +if [ ! -d $wsj0_data ]; then + echo "$wsj0_data does not exist. Please specify WSJ0 corpus directory" && exit 1 +fi +lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data + +# check whether run_init is executed +if [ ! 
-d data/lang ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +# lm directories +dir=data/local/local_lm +srcdir=data/local/nist_lm +mkdir -p $dir + +# check srilm ngram +! which ngram-count \ + && echo "SRILM tools not installed, which are required for LM training" && exit 1; + +# extract 5k vocabulary from a baseline language model +srclm=$srcdir/lm_tgpr_5k.arpa.gz +if [ -f $srclm ]; then + echo "Getting vocabulary from a baseline language model"; + ngram -lm $srclm -unk -map-unk '' -write-vocab $dir/vocab_5k.txt +else + echo "Language model $srclm does not exist" && exit 1; +fi + +# collect training data from WSJ0 +touch $dir/train.gz +if [ `du -m $dir/train.gz | cut -f 1` -eq 63 ]; then + echo "Not getting training data again [already exists]"; +else + echo "Collecting training data from $lm_train"; + gunzip -c $lm_train/{87,88,89}/*.z \ + | awk -v voc=$dir/vocab_5k.txt ' + BEGIN{ while((getline0) { invoc[$1]=1; }} + /^ "); } + } + printf("\n"); + }' | gzip -c > $dir/train.gz +fi + +# get validation data from Chime4 dev set +touch $dir/valid.gz +if [ `du -k $dir/valid.gz | cut -f 1` -eq 68 ]; then + echo "Not getting validation data again [already exists]"; +else + echo "Collecting validation data from $chime4_data/data/transcriptions"; + cut -d" " -f2- $chime4_data/data/transcriptions/dt05_real.trn_all \ + $chime4_data/data/transcriptions/dt05_simu.trn_all \ + |gzip -c > $dir/valid.gz +fi + +# train a large n-gram language model +lm_suffix=${order}gkn_5k +if [ -f $dir/lm_${lm_suffix}.arpa.gz ]; then + echo "A $order-gram language model aready exists and is not constructed again" + echo "To reconstruct, remove $dir/lm_${lm_suffix}.arpa.gz first" +else + echo "Training a $order-gram language model" + ngram-count -text $dir/train.gz -order $order \ + -vocab $dir/vocab_5k.txt -unk -map-unk "" \ + -gt2min 1 -gt3min 1 -gt4min 2 -gt5min 2 \ + -interpolate -kndiscount \ + -lm $dir/lm_${lm_suffix}.arpa.gz +fi +echo "Checking validation perplexity of $order-gram language model" +ngram -order $order -ppl $dir/valid.gz -lm $dir/lm_${lm_suffix}.arpa.gz +# e.g. 5-gram perplexity: +# file data/local/local_lm/valid.txt: 3280 sentences, 54239 words, 3 OOVs +# 0 zeroprobs, logprob= -96775.5 ppl= 48.1486 ppl1= 60.8611 + +# convert arpa LM to G.fst +echo "Converting the $order-gram language model to G.fst" +test=data/lang_test_${lm_suffix} +mkdir -p $test +cp -r data/lang/* $test || exit 1; + +gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + +utils/validate_lang.pl --skip-determinization-check $test || exit 1; + +echo "Succeeded in $order-gram LM training and conversion to G.fst" + diff --git a/egs/chime4/s5_1ch/local/chime4_train_rnnlms.sh b/egs/chime4/s5_1ch/local/chime4_train_rnnlms.sh new file mode 100755 index 00000000000..8324c8e06b1 --- /dev/null +++ b/egs/chime4/s5_1ch/local/chime4_train_rnnlms.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +# Config: +hidden=300 # Num-hidden units +class=200 # Num-classes +rnnlm_ver=rnnlm-0.3e # version of RNNLM to use +threads=1 # for RNNLM-HS +bptt=4 # length of BPTT unfolding in RNNLM +bptt_block=10 # length of BPTT unfolding in RNNLM + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. 
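+
+# Example invocation (the CHiME-4 path below is illustrative; --hidden and --bptt are the
+# options defined in the Config block above):
+#   local/chime4_train_rnnlms.sh --hidden 300 --bptt 4 /path/to/CHiME4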
+ +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specify a Chime4 root directory" + echo "If you use kaldi scripts distributed in the Chime4 data," + echo "It would be `pwd`/../.." + exit 1; +fi + +# check data directories +chime4_data=$1 +wsj0_data=$chime4_data/data/WSJ0 # directory of WSJ0 in Chime4. You can also specify your WSJ0 corpus directory +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +if [ ! -d $wsj0_data ]; then + echo "$wsj0_data does not exist. Please specify WSJ0 corpus directory" && exit 1 +fi +lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data + +# lm directories +dir=data/local/local_lm +srcdir=data/local/nist_lm +mkdir -p $dir + +# extract 5k vocabulary from a baseline language model +srclm=$srcdir/lm_tgpr_5k.arpa.gz +if [ -f $srclm ]; then + echo "Getting vocabulary from a baseline language model"; + gunzip -c $srclm | awk 'BEGIN{unig=0}{ + if(unig==0){ + if($1=="\\1-grams:"){unig=1}} + else { + if ($1 != "") { + if ($1=="\\2-grams:" || $1=="\\end\\") {exit} + else {print $2}} + }}' | sed "s///" > $dir/vocab_5k.rnn +else + echo "Language model $srclm does not exist" && exit 1; +fi + +# collect training data from WSJ0 +touch $dir/train.rnn +if [ `du -m $dir/train.rnn | cut -f 1` -eq 223 ]; then + echo "Not getting training data again [already exists]"; +else + echo "Collecting training data from $lm_train"; + gunzip -c $lm_train/{87,88,89}/*.z \ + | awk -v voc=$dir/vocab_5k.rnn ' + BEGIN{ while((getline0) { invoc[$1]=1; }} + /^ "); } + } + printf("\n"); + }' > $dir/train.rnn +fi + +# get validation data from Chime4 dev set +touch $dir/valid.rnn +if [ `cat $dir/valid.rnn | wc -w` -eq 54239 ]; then + echo "Not getting validation data again [already exists]"; +else + echo "Collecting validation data from $chime4_data/data/transcriptions"; + cut -d" " -f2- $chime4_data/data/transcriptions/dt05_real.trn_all \ + $chime4_data/data/transcriptions/dt05_simu.trn_all \ + > $dir/valid.rnn +fi + +# RNN language model training +$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 + +# train a RNN language model +rnnmodel=$dir/rnnlm_5k_h${hidden}_bptt${bptt} +if [ -f $rnnmodel ]; then + echo "An RNN language model already exists and will not be constructed again" + echo "To reconstruct, remove $rnnmodel first" +else + echo "Training a RNN language model with $rnnlm_ver" + echo "(runtime log is written to $dir/rnnlm.log)" + $train_cmd $dir/rnnlm.log \ + $KALDI_ROOT/tools/$rnnlm_ver/rnnlm -train $dir/train.rnn -valid $dir/valid.rnn \ + -rnnlm $rnnmodel -hidden $hidden -class $class \ + -rand-seed 1 -independent -debug 1 -bptt $bptt -bptt-block $bptt_block || exit 1; +fi + +# store in a RNNLM directory with necessary files +rnndir=data/lang_test_rnnlm_5k_h${hidden} +mkdir -p $rnndir +cp $rnnmodel $rnndir/rnnlm +grep -v -e "" -e "" $dir/vocab_5k.rnn > $rnndir/wordlist.rnn +touch $rnndir/unk.probs # make an empty file because we don't know unk-word probs. 
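+
+# The directory prepared above (data/lang_test_rnnlm_5k_h${hidden}) is the RNNLM directory
+# that steps/rnnlmrescore.sh consumes for N-best rescoring, as in the run_lmrescore.sh
+# flow shown earlier for CHiME-3.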
+ diff --git a/egs/chime4/s5_1ch/local/clean_chime4_format_data.sh b/egs/chime4/s5_1ch/local/clean_chime4_format_data.sh new file mode 100755 index 00000000000..23dc8a70d9e --- /dev/null +++ b/egs/chime4/s5_1ch/local/clean_chime4_format_data.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# 2015 Guoguo Chen +# 2016 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) +# Apache 2.0 + +# This script takes data prepared in a corpus-dependent way +# in data/local/, and converts it into the "canonical" form, +# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, +# data/train_si84, etc. + +lang_suffix= + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +. ./path.sh || exit 1; + +echo "Preparing train and test data" +srcdir=data/local/data +lmdir=data/local/nist_lm +tmpdir=data/local/lm_tmp +lexicon=data/local/lang_tmp/lexiconp.txt +mkdir -p $tmpdir + +for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do + mkdir -p data/$x + cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; + cp $srcdir/$x.txt data/$x/text || exit 1; + cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; + cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; + utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1; +done + + +# Next, for each type of language model, create the corresponding FST +# and the corresponding lang_test_* directory. + +echo Preparing language models for test + +for lm_suffix in tgpr_5k; do + test=data/lang${lang_suffix}_test_${lm_suffix} + + mkdir -p $test + cp -r data/lang${lang_suffix}/* $test || exit 1; + + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + + utils/validate_lang.pl --skip-determinization-check $test || exit 1; +done + +echo "Succeeded in formatting data." +rm -r $tmpdir diff --git a/egs/chime4/s5_1ch/local/clean_wsj0_data_prep.sh b/egs/chime4/s5_1ch/local/clean_wsj0_data_prep.sh new file mode 100755 index 00000000000..8c6989bc0b2 --- /dev/null +++ b/egs/chime4/s5_1ch/local/clean_wsj0_data_prep.sh @@ -0,0 +1,152 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +# Modified from the script for CHiME2 baseline +# Shinji Watanabe 02/13/2015 + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The argument should be a the top-level WSJ corpus directory." + echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +wsj0=$1 + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +if [ -z $IRSTLM ] ; then + export IRSTLM=$KALDI_ROOT/tools/irstlm/ +fi +export PATH=${PATH}:$IRSTLM/bin +if ! command -v prune-lm >/dev/null 2>&1 ; then + echo "$0: Error: the IRSTLM is not available or compiled" >&2 + echo "$0: Error: We used to install it by default, but." >&2 + echo "$0: Error: this is no longer the case." 
>&2 + echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2 + echo "$0: Error: and run extras/install_irstlm.sh" >&2 + exit 1 +fi + +cd $dir + +# This version for SI-84 +cat $wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx \ + | $local/cstr_ndx2flist.pl $wsj0 | sort -u > tr05_orig_clean.flist + +# Now for the test sets. +# $wsj0/wsj1/doc/indices/readme.doc +# describes all the different test sets. +# Note: each test-set seems to come in multiple versions depending +# on different vocabulary sizes, verbalized vs. non-verbalized +# pronunciations, etc. We use the largest vocab and non-verbalized +# pronunciations. +# The most normal one seems to be the "baseline 60k test set", which +# is h1_p0. + +# Nov'92 (330 utts, 5k vocab) +cat $wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx | \ + $local/cstr_ndx2flist.pl $wsj0 | sort > et05_orig_clean.flist + +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know +# why (could be older versions of the disks). +find $wsj0/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dt05_orig_clean.flist + +# Finding the transcript files: +find -L $wsj0 -iname '*.dot' > dot_files.flist + +# Convert the transcripts into our format (no normalization yet) +# adding suffix to utt_id +# 0 for clean condition +for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do + $local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp + cat ${x}_sph_tmp.scp | awk '{print $1}' \ + | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1 + cat ${x}_sph_tmp.scp | awk '{printf("%s %s\n", $1, $2);}' > ${x}_sph.scp + cat ${x}_tmp.trans1 | awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1 +done + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) +for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do + awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \ + > ${x}_wav.scp +done + +# Make the utt2spk and spk2utt files. +for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do + cat ${x}_sph.scp | awk '{print $1}' \ + | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +#in case we want to limit lm's on most frequent words, copy lm training word frequency list +cp $wsj0/wsj0/doc/lng_modl/vocab/wfl_64.lst $lmdir +chmod u+w $lmdir/*.lst # had weird permissions on source. + +# The 5K vocab language model without verbalized pronunciations. +# This is used for 3rd CHiME challenge +# trigram would be: !only closed vocabulary here! +cp $wsj0/wsj0/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1; +chmod u+rw $lmdir/lm_tg_5k.arpa.gz +gunzip $lmdir/lm_tg_5k.arpa.gz +tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz +rm $lmdir/lm_tg_5k.arpa + +prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1; +gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1; + + +if [ ! 
-f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then + rm -f wsj0-train-spkrinfo.txt + wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \ + || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ + wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt ); +fi + +if [ ! -f wsj0-train-spkrinfo.txt ]; then + echo "Could not get the spkrinfo.txt file from LDC website (moved)?" + echo "This is possibly omitted from the training disks; couldn't find it." + echo "Everything else may have worked; we just may be missing gender info" + echo "which is only needed for VTLN-related diagnostics anyway." + exit 1 +fi +# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the +# LDC put it on the web. Perhaps it was accidentally omitted from the +# disks. + +cat $wsj0/wsj0/doc/spkrinfo.txt \ + ./wsj0-train-spkrinfo.txt | \ + perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \ + awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender + + +echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/cstr_ndx2flist.pl b/egs/chime4/s5_1ch/local/cstr_ndx2flist.pl new file mode 100755 index 00000000000..d19db421a9f --- /dev/null +++ b/egs/chime4/s5_1ch/local/cstr_ndx2flist.pl @@ -0,0 +1,54 @@ +#!/usr/bin/env perl + +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 12/1/12 + +# This program takes as its standard input an .ndx file from the WSJ corpus that looks +# like this: +#;; File: tr_s_wv1.ndx, updated 04/26/94 +#;; +#;; Index for WSJ0 SI-short Sennheiser training data +#;; Data is read WSJ sentences, Sennheiser mic. +#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts +#;; per speaker TI) = 7236 utts +#;; +#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 +#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 +#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 + +# and as command-line argument it takes the names of the WSJ disk locations, e.g.: +# /group/corpora/public/wsjcam0/data on DICE machines. +# It outputs a list of absolute pathnames. + +$wsj_dir = $ARGV[0]; + +while(){ + if(m/^;/){ next; } # Comment. Ignore it. + else { + m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; + $filename = $2; # as a subdirectory of the distributed disk. 
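+    # some index entries omit the .wv1 extension, so append it before checking that the file exists on disk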
+    if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
+    $filename = "$wsj_dir/$filename";
+    if (-e $filename) {
+      print "$filename\n";
+    } else {
+      print STDERR "File $filename found in the index but not on disk\n";
+    }
+  }
+}
diff --git a/egs/chime4/s5_1ch/local/find_noisy_transcripts.pl b/egs/chime4/s5_1ch/local/find_noisy_transcripts.pl
new file mode 100755
index 00000000000..fdeb38d9444
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/find_noisy_transcripts.pl
@@ -0,0 +1,65 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+# This program takes on its standard input a list of utterance
+# id's, one per line (e.g. 4k0c030a is an utterance id).
+# It takes as its single argument a list of .dot files, and extracts
+# from those dot files the transcripts for the given dataset
+# (represented by the file list).
+#
+
+@ARGV == 1 || die "find_noisy_transcripts.pl dot_files_flist < utterance_ids > transcripts";
+$dot_flist = shift @ARGV;
+
+open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
+while(<L>){
+  chop;
+  m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
+  $spk = $1;
+  $spk2dot{$spk} = $_;
+}
+
+
+
+while(<STDIN>){
+  chop;
+  $uttid_orig = $_;
+  $uttid = substr $uttid_orig, 0, 8;
+  $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
+  $spk = $1;
+  if($spk ne $curspk) {
+    %utt2trans = ( ); # Don't keep all the transcripts in memory...
+    $curspk = $spk;
+    $dotfile = $spk2dot{$spk};
+    defined $dotfile || die "No dot file for speaker $spk\n";
+    open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
+    while(<F>) {
+      $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
+      $trans = $1;
+      $utt = $2;
+      $utt2trans{$utt} = $trans;
+    }
+  }
+  if(!defined $utt2trans{$uttid}) {
+    print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
+  } else {
+    print "$uttid_orig $utt2trans{$uttid}\n";
+  }
+}
+
+
diff --git a/egs/chime4/s5_1ch/local/find_transcripts.pl b/egs/chime4/s5_1ch/local/find_transcripts.pl
new file mode 100755
index 00000000000..6429411b864
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/find_transcripts.pl
@@ -0,0 +1,64 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+# This program takes on its standard input a list of utterance
+# id's, one per line (e.g. 4k0c030a is an utterance id).
+# It takes as its single argument a list of .dot files, and extracts
+# from those dot files the transcripts for the given dataset
+# (represented by the file list).
+#
+
+@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
+$dot_flist = shift @ARGV;
+
+open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
+while(<L>){
+  chop;
+  m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
+  $spk = $1;
+  $spk2dot{$spk} = $_;
+}
+
+
+
+while(<STDIN>){
+  chop;
+  $uttid = $_;
+  $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
+  $spk = $1;
+  if($spk ne $curspk) {
+    %utt2trans = ( ); # Don't keep all the transcripts in memory...
+    $curspk = $spk;
+    $dotfile = $spk2dot{$spk};
+    defined $dotfile || die "No dot file for speaker $spk\n";
+    open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
+    while(<F>) {
+      $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
+      $trans = $1;
+      $utt = $2;
+      $utt2trans{$utt} = $trans;
+    }
+  }
+  if(!defined $utt2trans{$uttid}) {
+    print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
+  } else {
+    print "$uttid $utt2trans{$uttid}\n";
+  }
+}
+
+
diff --git a/egs/timit/s4/utils/eps2disambig.pl b/egs/chime4/s5_1ch/local/flist2scp.pl
similarity index 57%
rename from egs/timit/s4/utils/eps2disambig.pl
rename to egs/chime4/s5_1ch/local/flist2scp.pl
index 049802b0888..234e4add1ed 100755
--- a/egs/timit/s4/utils/eps2disambig.pl
+++ b/egs/chime4/s5_1ch/local/flist2scp.pl
@@ -14,10 +14,18 @@
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 
-# This script replaces epsilon with #0 on the input side only, of the G.fst
-# acceptor.
+
+# takes in a file list with lines like
+# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
+# and outputs an scp in kaldi format with lines like
+# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
+# (the first field is the utterance-id, which is the same as the basename of the file.)
+
 while(<>){
-    s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
-    print;
+    m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_";
+    $id = $1;
+    $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
+    print "$id $_";
 }
+
diff --git a/egs/chime4/s5_1ch/local/normalize_transcript.pl b/egs/chime4/s5_1ch/local/normalize_transcript.pl
new file mode 100755
index 00000000000..09cee06172e
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/normalize_transcript.pl
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This takes data from the standard input that's unnormalized transcripts in the format
+# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise]
+# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam]
+# and outputs normalized transcripts.
+# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc
+
+@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2";
+$noise_word = shift @ARGV;
+
+while(<STDIN>) {
+  $_ =~ m:^(\S+) (.+): || die "bad line $_";
+  $utt = $1;
+  $trans = $2;
+  print "$utt";
+  foreach $w (split (" ",$trans)) {
+    $w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary.
+    $w =~ s:\\::g;     # Remove backslashes.  We don't need the quoting.
+    $w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts.
+    $w =~ s:^\.POINT$:POINT:;     # Normalization for Nov'93 test transcripts.
+    if($w =~ m:^\[\<\w+\]$: ||   # E.g. [<door_slam], this means a door slammed in the preceding word. Delete.
+       $w =~ m:^\[\w+\>\]$: ||   # E.g. [door_slam>], this means a door slammed in the next word. Delete.
+       $w =~ m:\[\w+/\]$: ||     # E.g. [phone_ring/], which indicates the start of this phenomenon.
+       $w =~ m:\[\/\w+]$: ||     # E.g. [/phone_ring], which indicates the end of this phenomenon.
+       $w eq "~" ||              # This is used to indicate truncation of an utterance.  Not a word.
+       $w eq ".") {              # "." is used to indicate a pause.  Silence is optional anyway so not much
+                                 # point including this in the transcript.
+      next; # we won't print this word.
+    } elsif($w =~ m:\[\w+\]:) {  # Other noises, e.g. [loud_breath].
+      print " $noise_word";
+    } elsif($w =~ m:^\<([\w\']+)\>$:) {
+      # e.g. replace <and> with and.  (the <> means verbal deletion of a word).. but it's pronounced.
+      print " $1";
+    } elsif($w eq "--DASH") {
+      print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH.
+#   } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word
+#     print " $1 -DASH";
+    } else {
+      print " $w";
+    }
+  }
+  print "\n";
+}
diff --git a/egs/chime4/s5_1ch/local/real_enhan_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/real_enhan_chime4_data_prep.sh
new file mode 100755
index 00000000000..b5ff06f6903
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/real_enhan_chime4_data_prep.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+set -e
+
+# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+# This is modified from the script in standard Kaldi recipe to account
+# for the way the WSJ data is structured on the Edinburgh systems.
+# - Arnab Ghoshal, 29/05/12
+
+# Modified from the script for CHiME2 baseline
+# Shinji Watanabe 02/13/2015
+
+# Config:
+eval_flag=false # make it true when the evaluation data are released
+
+. utils/parse_options.sh || exit 1;
+
+if [ $# -ne 2 ]; then
+  printf "\nUSAGE: %s <enhancement method> <enhanced speech directory>\n\n" `basename $0`
+  echo "The second argument should be the directory that only contains the enhanced speech data."
+  exit 1;
+fi
+
+echo "$0 $@"  # Print the command line for logging
+
+enhan=$1
+audio_dir=$2
+
+dir=`pwd`/data/local/data
+mkdir -p $dir
+local=`pwd`/local
+utils=`pwd`/utils
+odir=`pwd`/data
+
+. ./path.sh # Needed for KALDI_ROOT
+export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
+sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
+if [ ! -x $sph2pipe ]; then
+  echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
+  exit 1;
+fi
+
+if $eval_flag; then
+list_set="tr05_real_$enhan dt05_real_$enhan et05_real_$enhan"
+else
+list_set="tr05_real_$enhan dt05_real_$enhan"
+fi
+
+cd $dir
+
+find $audio_dir/ -name '*.wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_$enhan.flist
+find $audio_dir/ -name '*.wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_$enhan.flist
+if $eval_flag; then
+find $audio_dir/ -name '*.wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_$enhan.flist
+fi
+
+# make a scp file from file list
+for x in $list_set; do
+  cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids
+  paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp
+done
+
+# make a transcription from the dot files
+cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> tr05_real_$enhan.ids
+cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_$enhan.txt
+paste -d" " tr05_real_$enhan.ids tr05_real_$enhan.txt | sort -k 1 > tr05_real_$enhan.trans1
+cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> dt05_real_$enhan.ids
+cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_$enhan.txt
+paste -d" " dt05_real_$enhan.ids dt05_real_$enhan.txt | sort -k 1 > dt05_real_$enhan.trans1
+if $eval_flag; then
+cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> et05_real_$enhan.ids
+cat et05_real.dot | sed -e 's/(.*)//' > et05_real_$enhan.txt
+paste -d" " et05_real_$enhan.ids et05_real_$enhan.txt | sort -k 1 > et05_real_$enhan.trans1
+fi
+
+# Do some basic normalization steps.  At this point we don't remove OOVs--
+# that will be done inside the training scripts, as we'd like to make the
+# data-preparation stage independent of the specific lexicon used.
+noiseword="<NOISE>";
+for x in $list_set;do
+  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
+    | sort > $x.txt || exit 1;
+done
+
+# Make the utt2spk and spk2utt files.
+for x in $list_set; do
+  cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk
+  cat ${x}_wav.scp | awk '{print $1}' > $x.utt
+  paste -d" " $x.utt $x.spk > $x.utt2spk
+  cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
+done
+
+# copying data to data/...
+for x in $list_set; do
+  mkdir -p $odir/$x
+  cp ${x}_wav.scp $odir/$x/wav.scp || exit 1;
+  cp ${x}.txt $odir/$x/text || exit 1;
+  cp ${x}.spk2utt $odir/$x/spk2utt || exit 1;
+  cp ${x}.utt2spk $odir/$x/utt2spk || exit 1;
+done
+
+echo "Data preparation succeeded"
diff --git a/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh
new file mode 100755
index 00000000000..86186b9e543
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+set -e
+
+# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+# This is modified from the script in standard Kaldi recipe to account
+# for the way the WSJ data is structured on the Edinburgh systems.
+# - Arnab Ghoshal, 29/05/12
+
+# Modified from the script for CHiME2 baseline
+# Shinji Watanabe 02/13/2015
+
+# Config:
+eval_flag=false # make it true when the evaluation data are released
+
+. utils/parse_options.sh || exit 1;
+
+if [ $# -ne 1 ]; then
+  printf "\nUSAGE: %s <CHiME4 root directory>\n\n" `basename $0`
+  echo "The argument should be the top-level CHiME4 directory."
+ echo "It is assumed that there will be a 'data' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +audio_dir=$1/data/audio/16kHz/isolated +trans_dir=$1/data/transcriptions + +echo "extract 5th channel (CH5.wav, the center bottom edge in the front of the tablet) for noisy data" + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +if $eval_flag; then +list_set="tr05_real_noisy dt05_real_noisy et05_real_noisy" +else +list_set="tr05_real_noisy dt05_real_noisy" +fi + +cd $dir + +find $audio_dir -name '*CH5.wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_noisy.flist +find $audio_dir -name '*CH5.wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_noisy.flist +if $eval_flag; then +find $audio_dir -name '*CH5.wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_noisy.flist +fi + +# make a dot format from json annotation files +cp $trans_dir/tr05_real.dot_all tr05_real.dot +cp $trans_dir/dt05_real.dot_all dt05_real.dot +if $eval_flag; then +cp $trans_dir/et05_real.dot_all et05_real.dot +fi + +# make a scp file from file list +for x in $list_set; do + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp +done + +#make a transcription from dot +cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_REAL"}'> tr05_real_noisy.ids +cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_noisy.txt +paste -d" " tr05_real_noisy.ids tr05_real_noisy.txt | sort -k 1 > tr05_real_noisy.trans1 +cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_REAL"}'> dt05_real_noisy.ids +cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_noisy.txt +paste -d" " dt05_real_noisy.ids dt05_real_noisy.txt | sort -k 1 > dt05_real_noisy.trans1 +if $eval_flag; then +cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_REAL"}'> et05_real_noisy.ids +cat et05_real.dot | sed -e 's/(.*)//' > et05_real_noisy.txt +paste -d" " et05_real_noisy.ids et05_real_noisy.txt | sort -k 1 > et05_real_noisy.trans1 +fi + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in $list_set;do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Make the utt2spk and spk2utt files. +for x in $list_set; do + cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + cat ${x}_wav.scp | awk '{print $1}' > $x.utt + paste -d" " $x.utt $x.spk > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +# copying data to data/... 
+for x in $list_set; do + mkdir -p ../../$x + cp ${x}_wav.scp ../../$x/wav.scp || exit 1; + cp ${x}.txt ../../$x/text || exit 1; + cp ${x}.spk2utt ../../$x/spk2utt || exit 1; + cp ${x}.utt2spk ../../$x/utt2spk || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/run_beamform_2ch_track.sh b/egs/chime4/s5_1ch/local/run_beamform_2ch_track.sh new file mode 100755 index 00000000000..29d7ee0ff5e --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_beamform_2ch_track.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +nj=10 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamform_2ch_track.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +sdir=$1 +odir=$2 + +wdir=data/beamforming_2ch_track + +if [ -z $BEAMFORMIT ] ; then + export BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt +fi +export PATH=${PATH}:$BEAMFORMIT +! hash BeamformIt && echo "Missing BeamformIt, run 'cd ../../../tools/; make beamformit;'" && exit 1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $wdir/log + +allwavs=`find $sdir/ | grep "\.wav" | tr ' ' '\n' | awk -F '/' '{print $(NF-1)"/"$NF}'` + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$wdir/wavfiles.list +echo $allwavs | tr ' ' '\n' | awk -F '.' '{print $1}' | sort | uniq > $output_wavfiles + +# channel list +input_arrays=$wdir/channels +echo $allwavs | tr ' ' '\n' | sort | awk 'NR%2==1' > $wdir/channels.1st +echo $allwavs | tr ' ' '\n' | sort | awk 'NR%2==0' > $wdir/channels.2nd +paste -d" " $output_wavfiles $wdir/channels.1st $wdir/channels.2nd > $input_arrays + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $wdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/chime4.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done +# making a subdirectory for the output wav files +for x in `awk -F '/' '{print $1}' $output_wavfiles | sort | uniq`; do + mkdir -p $odir/$x +done + +chmod a+x $wdir/log/beamform.*.sh +$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + $wdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_beamform_6ch_track.sh b/egs/chime4/s5_1ch/local/run_beamform_6ch_track.sh new file mode 100755 index 00000000000..92e7b95707f --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_beamform_6ch_track.sh @@ -0,0 +1,100 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +nj=10 +cmd=run.pl +bmf="1 3 4 5 6" +eval_flag=false # make it true when the evaluation data are released + +. 
utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 2)" + echo "Usage: local/run_beamform_6ch_track.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 3 4 5 6\" # microphones used for beamforming (2th mic is omitted in default)" + exit 1; +fi + +sdir=$1 +odir=$2 +wdir=data/beamforming_`echo $bmf | tr ' ' '_'` + +if [ -z $BEAMFORMIT ] ; then + export BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt +fi +export PATH=${PATH}:$BEAMFORMIT +! hash BeamformIt && echo "Missing BeamformIt, run 'cd ../../../tools/; make beamformit;'" && exit 1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $wdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +# we only process dev and eval waves +output_wavfiles=$wdir/wavfiles.list +if $eval_flag; then + find $sdir/{dt,et}*{simu,real}/ | grep CH1.wav \ + | awk -F '/' '{print $(NF-1) "/" $NF}' | sed -e "s/\.CH1\.wav//" | sort > $output_wavfiles +else + find $sdir/dt*{simu,real}/ | grep CH1.wav \ + | awk -F '/' '{print $(NF-1) "/" $NF}' | sed -e "s/\.CH1\.wav//" | sort > $output_wavfiles +fi + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$wdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $wdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/chime4.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done +# making a subdirectory for the output wav files +for x in `awk -F '/' '{print $1}' $output_wavfiles | sort | uniq`; do + mkdir -p $odir/$x +done + +chmod a+x $wdir/log/beamform.*.sh +$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + $wdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_dnn.sh b/egs/chime4/s5_1ch/local/run_dnn.sh new file mode 100755 index 00000000000..db6437258fc --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_dnn.sh @@ -0,0 +1,237 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# Config: +nj=30 +stage=0 # resume training with --stage=N +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. 
utils/parse_options.sh || exit 1; + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + exit 1; +fi + +# set enhanced data +enhan=$1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# check whether run_init is executed +if [ ! -d data/lang ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +# check whether run_init is executed +if [ ! -d exp/tri3b_tr05_multi_${train} ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +# get alignments +if [ $stage -le 0 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_${train} data/lang exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}_ali + steps/align_fmllr.sh --nj 4 --cmd "$train_cmd" \ + data/dt05_multi_$enhan data/lang exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}_ali_dt05 +fi + +# make fmllr feature for training multi = simu + real +gmmdir=exp/tri3b_tr05_multi_${train}_ali +data_fmllr=data-fmllr-tri3b +mkdir -p $data_fmllr +fmllrdir=fmllr-tri3b/${train} +if [ $stage -le 1 ]; then + for x in tr05_real_${train} tr05_simu_${train}; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi + +# make fmllr feature for dev and eval +gmmdir=exp/tri3b_tr05_multi_${train} +fmllrdir=fmllr-tri3b/$enhan +if [ $stage -le 2 ]; then + if $eval_flag; then + tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" + else + tasks="dt05_real_$enhan dt05_simu_$enhan" + fi + for x in $tasks; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_tgpr_5k_$x \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi + +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 3 ]; then + for data_dir in $data_fmllr/tr05_real_${train} $data_fmllr/tr05_simu_${train} $data_fmllr/dt05_real_$enhan $data_fmllr/dt05_simu_$enhan; do + utils/data/get_utt2dur.sh $data_dir + done + + utils/combine_data.sh $data_fmllr/tr05_multi_${train} $data_fmllr/tr05_simu_${train} $data_fmllr/tr05_real_${train} + utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan + if $eval_flag; then + for data_dir in $data_fmllr/et05_real_$enhan $data_fmllr/et05_simu_$enhan; do + utils/data/get_utt2dur.sh $data_dir + done + utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan + fi +fi + +# pre-train dnn +dir=exp/tri4a_dnn_pretrain_tr05_multi_${train} +if [ $stage -le 4 ]; then + $cuda_cmd $dir/_pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh --nn-depth 7 --rbm-iter 3 $data_fmllr/tr05_multi_${train} $dir +fi + +# train dnn +dir=exp/tri4a_dnn_tr05_multi_${train} +ali=exp/tri3b_tr05_multi_${train}_ali +ali_dev=exp/tri3b_tr05_multi_${train}_ali_dt05 +feature_transform=exp/tri4a_dnn_pretrain_tr05_multi_${train}/final.feature_transform +dbn=exp/tri4a_dnn_pretrain_tr05_multi_${train}/7.dbn +if [ $stage -le 5 ]; then + $cuda_cmd $dir/_train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn 
--hid-layers 0 --learn-rate 0.008 \ + $data_fmllr/tr05_multi_${train} $data_fmllr/dt05_multi_$enhan data/lang $ali $ali_dev $dir +fi + +# decode enhanced speech +if [ $stage -le 6 ]; then + utils/mkgraph.sh data/lang_test_tgpr_5k $dir $dir/graph_tgpr_5k + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/dt05_real_$enhan $dir/decode_tgpr_5k_dt05_real_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/dt05_simu_$enhan $dir/decode_tgpr_5k_dt05_simu_$enhan & + if $eval_flag; then + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/et05_real_$enhan $dir/decode_tgpr_5k_et05_real_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/et05_simu_$enhan $dir/decode_tgpr_5k_et05_simu_$enhan & + fi + wait; +fi + +# Sequence training using sMBR criterion, we do Stochastic-GD +# with per-utterance updates. We use usually good acwt 0.1 +# Lattices are re-generated after 1st epoch, to get faster convergence. +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr +srcdir=exp/tri4a_dnn_tr05_multi_${train} +acwt=0.1 + +# First we generate lattices and alignments: +# gawk must be installed to perform awk -v FS="/" '{ print gensub(".gz","","",$NF)" gunzip -c "$0" |"; }' in +# steps/nnet/make_denlats.sh +if [ $stage -le 7 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_denlats +fi + +# Re-train the DNN by 1 iteration of sMBR +if [ $stage -le 8 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir +fi + +# Decode (reuse HCLG graph) +if [ $stage -le 9 ]; then + for ITER in 1; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + if $eval_flag; then + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + fi + done +fi + +# Re-generate lattices, run 4 more sMBR iterations +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats +srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr +acwt=0.1 + +# Generate lattices 
and alignments: +if [ $stage -le 10 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_denlats +fi + +# Re-train the DNN by 4 iterations of sMBR +if [ $stage -le 11 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 +fi + +# Decode (reuse HCLG graph) +if [ $stage -le 12 ]; then + for ITER in 1 2 3 4; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + if $eval_flag; then + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + fi + done + wait +fi + +# scoring +if [ $stage -le 13 ]; then + # decoded results of enhanced speech using DNN AMs trained with enhanced data + local/chime4_calc_wers.sh exp/tri4a_dnn_tr05_multi_${train} $enhan exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k \ + > exp/tri4a_dnn_tr05_multi_${train}/best_wer_$enhan.result + head -n 15 exp/tri4a_dnn_tr05_multi_${train}/best_wer_$enhan.result + # decoded results of enhanced speech using sequence-training DNN + ./local/chime4_calc_wers_smbr.sh exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ${enhan} exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k \ + > exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats/best_wer_${enhan}.result + head -n 15 exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats/best_wer_${enhan}.result +fi + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_dnn_recog.sh b/egs/chime4/s5_1ch/local/run_dnn_recog.sh new file mode 100755 index 00000000000..5c9c1010fb2 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_dnn_recog.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# Config: +nj=30 +stage=0 # resume training with --stage=N +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. 
utils/parse_options.sh || exit 1; + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + echo "Second argument specifies acoustic and language model directory" + exit 1; +fi + +# set enhanced data +enhan=$1 +# set model directory +mdir=$2 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# check data/loca/data +if [ ! -d $mdir/data/local/data ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d data/local/data ]; then + echo "copy $mdir/data/local/data" + mkdir -p data/local + cp -r $mdir/data/local/data data/local/ +fi + +# check gmm model +if [ ! -d $mdir/exp/tri3b_tr05_multi_${train} ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then + echo "copy $mdir/exp/tri3b_tr05_multi_${train}" + mkdir -p exp + cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ +fi + +# check dnn graph +if [ ! -d $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then + echo "copy $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k" + mkdir -p exp/tri4a_dnn_tr05_multi_${train} + cp -r $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k exp/tri4a_dnn_tr05_multi_${train}/ +fi + +# check dnn smbr model +if [ ! -d $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ]; then + echo "copy $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats" + mkdir -p exp + cp -r $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats exp/ +fi + +# make fmllr feature for dev and eval +gmmdir=exp/tri3b_tr05_multi_${train} +data_fmllr=data-fmllr-tri3b +mkdir -p $data_fmllr +fmllrdir=fmllr-tri3b/$enhan +if [ $stage -le 4 ]; then + if $eval_flag; then + tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" + else + tasks="dt05_real_$enhan dt05_simu_$enhan" + fi + for x in $tasks; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_tgpr_5k_$x \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi + +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 5 ]; then + utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan + if $eval_flag; then + utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan + fi +fi + +# Re-generate lattices, run 4 more sMBR iterations +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats +acwt=0.1 + +# Decode (reuse HCLG graph) +if [ $stage -le 6 ]; then + for ITER in 1 2 3 4; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k 
$data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + if $eval_flag; then + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + fi + wait + done +fi + +# scoring +if [ $stage -le 7 ]; then + # decoded results of enhanced speech using sequence-training DNN + ./local/chime4_calc_wers_smbr.sh $dir ${enhan} exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k > $dir/best_wer_${enhan}.result + head -n 15 $dir/best_wer_${enhan}.result +fi + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_gmm.sh b/egs/chime4/s5_1ch/local/run_gmm.sh new file mode 100755 index 00000000000..bedd6de51a5 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_gmm.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# Config: +nj=30 +stage=0 # resume training with --stage=N +train=noisy # noisy data multi-condition training +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 3 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + echo "Second argument specifies the directory of enhanced wav files" + echo "Third argument specifies the CHiME4 root directory" + exit 1; +fi + +# set enhanced data +enhan=$1 +enhan_data=$2 +# set chime4 data +chime4_data=$3 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# check whether run_init is executed +if [ ! -d data/lang ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +####################### +#### training ######### +if [ $stage -le 1 ]; then + # process for distant talking speech for real and simulation data + local/real_noisy_chime4_data_prep.sh $chime4_data + local/simu_noisy_chime4_data_prep.sh $chime4_data +fi + +# Now make MFCC features for clean, close, and noisy data +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. 
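+# e.g. mfccdir=/export/data/mfcc   # example path only; point this at whatever large disk you have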
+mfccdir=mfcc +if [ $stage -le 2 ]; then + if $eval_flag; then + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} et05_real_${train} et05_simu_${train}" + else + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train}" + fi + for x in $tasks; do + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +# make mixed training set from real and simulation training data +# multi = simu + real +if [ $stage -le 3 ]; then + utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} + utils/combine_data.sh data/dt05_multi_${train} data/dt05_simu_${train} data/dt05_real_${train} + if $eval_flag; then + utils/combine_data.sh data/et05_multi_${train} data/et05_simu_${train} data/et05_real_${train} + fi +fi + +# training models for noisy data +if [ $stage -le 4 ]; then + nspk=`wc -l data/tr05_multi_${train}/spk2utt | awk '{print $1}'` + if [ $nj -gt $nspk ]; then + nj2=$nspk + else + nj2=$nj + fi + # training monophone model + steps/train_mono.sh --boost-silence 1.25 --nj $nj2 --cmd "$train_cmd" \ + data/tr05_multi_${train} data/lang exp/mono0a_tr05_multi_${train} + steps/align_si.sh --boost-silence 1.25 --nj $nj2 --cmd "$train_cmd" \ + data/tr05_multi_${train} data/lang exp/mono0a_tr05_multi_${train} exp/mono0a_ali_tr05_multi_${train} + + # training triphone model with lad mllt features + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/tr05_multi_${train} data/lang exp/mono0a_ali_tr05_multi_${train} exp/tri1_tr05_multi_${train} + steps/align_si.sh --nj $nj2 --cmd "$train_cmd" \ + data/tr05_multi_${train} data/lang exp/tri1_tr05_multi_${train} exp/tri1_ali_tr05_multi_${train} + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/tr05_multi_${train} data/lang exp/tri1_ali_tr05_multi_${train} exp/tri2b_tr05_multi_${train} + steps/align_si.sh --nj $nj2 --cmd "$train_cmd" \ + --use-graphs true data/tr05_multi_${train} data/lang exp/tri2b_tr05_multi_${train} exp/tri2b_ali_tr05_multi_${train} + + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/tr05_multi_${train} data/lang exp/tri2b_ali_tr05_multi_${train} exp/tri3b_tr05_multi_${train} + utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}/graph_tgpr_5k +fi +#### training done #### +####################### + + +##################### +#### tsting ######### +# process for enhanced data +if [ $stage -le 5 ]; then + if [ ! -d data/dt05_real_$enhan ]; then + local/real_enhan_chime4_data_prep.sh $enhan $enhan_data + local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data + fi +fi + +# Now make MFCC features for enhanced data +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc/$enhan +if [ $stage -le 6 ]; then + if $eval_flag; then + tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" + else + tasks="dt05_real_$enhan dt05_simu_$enhan" + fi + for x in $tasks; do + if [ ! -e data/$x/feats.scp ]; then + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + fi + done +fi + +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 7 ]; then + if [ ! 
-d data/dt05_multi_$enhan ]; then + utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan + if $eval_flag; then + utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan + fi + fi +fi + +# decode enhanced speech using AMs trained with enhanced data +if [ $stage -le 8 ]; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_simu_$enhan & + if $eval_flag; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_simu_$enhan & + fi + wait; +fi + +# scoring +if [ $stage -le 9 ]; then + # decoded results of enhanced speech using AMs trained with enhanced data + local/chime4_calc_wers.sh exp/tri3b_tr05_multi_${train} $enhan exp/tri3b_tr05_multi_${train}/graph_tgpr_5k \ + > exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result + head -n 15 exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result +fi +#### tsting done #### +##################### + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_gmm_recog.sh b/egs/chime4/s5_1ch/local/run_gmm_recog.sh new file mode 100755 index 00000000000..8824aa255f4 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_gmm_recog.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# Config: +nj=30 +stage=0 # resume training with --stage=N +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 3 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + echo "Second argument specifies the directory of enhanced wav files" + echo "Third argument specifies acoustic and language model directory" + exit 1; +fi + +# set enhanced data +enhan=$1 +enhan_data=$2 +# set model directory +mdir=$3 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# check data/loca/data +if [ ! -d $mdir/data/local/data ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d data/local/data ]; then + echo "copy $mdir/data/local/data" + mkdir -p data/local + cp -r $mdir/data/local/data data/local/ +fi + +# check gmm model +if [ ! 
-d $mdir/exp/tri3b_tr05_multi_${train} ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then + echo "copy $mdir/exp/tri3b_tr05_multi_${train}" + mkdir -p exp + cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ +fi + +# process for enhanced data +if [ $stage -le 0 ]; then + if [ ! -d data/dt05_real_$enhan ]; then + local/real_enhan_chime4_data_prep.sh $enhan $enhan_data + local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data + fi +fi + +# Now make MFCC features for enhanced data +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc/$enhan +if [ $stage -le 1 ]; then + if $eval_flag; then + tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" + else + tasks="dt05_real_$enhan dt05_simu_$enhan" + fi + for x in $tasks; do + if [ ! -e data/$x/feats.scp ]; then + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + fi + done +fi + +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 2 ]; then + if [ ! -d data/dt05_multi_$enhan ]; then + utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan + if $eval_flag; then + utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan + fi + fi +fi + +# decode enhanced speech using AMs trained with enhanced data +if [ $stage -le 3 ]; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_simu_$enhan & + if $eval_flag; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_simu_$enhan & + fi + wait; +fi + +# scoring +if [ $stage -le 4 ]; then + # decoded results of enhanced speech using AMs trained with enhanced data + local/chime4_calc_wers.sh exp/tri3b_tr05_multi_${train} $enhan exp/tri3b_tr05_multi_${train}/graph_tgpr_5k \ + > exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result + head -n 15 exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result +fi + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_init.sh b/egs/chime4/s5_1ch/local/run_init.sh new file mode 100755 index 00000000000..3cafd7fbada --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_init.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Config: +nj=30 +stage=0 # resume training with --stage=N +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specifies a CHiME4 root directory" + echo "If you use scripts distributed in the CHiME4 package," + echo "It would be `pwd`/../.." + exit 1; +fi + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +# clean data +chime4_data=$1 +wsj0_data=$chime4_data/data/WSJ0 # directory of WSJ0 in Chime4. You can also specify your WSJ0 corpus directory + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +if [ $stage -le 0 ]; then + # process for clean speech and making LMs etc. from original WSJ0 + # note that training on clean data means original WSJ0 data only (no booth data) + local/clean_wsj0_data_prep.sh $wsj0_data + local/wsj_prepare_dict.sh + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang + local/clean_chime4_format_data.sh +fi + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_lmrescore.sh b/egs/chime4/s5_1ch/local/run_lmrescore.sh new file mode 100755 index 00000000000..9ae66bdc3d6 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_lmrescore.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +nj=12 +stage=1 +order=5 +hidden=300 +rnnweight=0.5 +nbest=100 +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a root directory of Chime4 data" + echo "Second argument specifies a unique name for different enhancement method" + exit 1; +fi + +# set language models +lm_suffix=${order}gkn_5k +rnnlm_suffix=rnnlm_5k_h${hidden} + +# data root +chime4_data=$1 +# enhan data +enhan=$2 + +# check data +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi + +# check whether run_dnn is executed +srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats +if [ ! -d $srcdir ]; then + echo "error, execute local/run_dnn.sh, first" + exit 1; +fi + +# train a high-order n-gram language model +if [ $stage -le 1 ]; then + local/chime4_train_lms.sh $chime4_data || exit 1; +fi + +# train a RNN language model +if [ $stage -le 2 ]; then + local/chime4_train_rnnlms.sh $chime4_data || exit 1; +fi + +# preparation +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_lmrescore +mkdir -p $dir +# make a symbolic link to graph info +if [ ! -e $dir/graph_tgpr_5k ]; then + if [ ! -e exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then + echo "graph is missing, execute local/run_dnn.sh, correctly" + exit 1; + fi + pushd . ; cd $dir + ln -s ../tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k . 
+ popd +fi + +# rescore lattices by a high-order N-gram +if [ $stage -le 3 ]; then + # check the best iteration + if [ ! -f $srcdir/log/best_wer_$enhan ]; then + echo "error, execute local/run_dnn.sh, first" + exit 1; + fi + it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` + # rescore lattices + if $eval_flag; then + tasks="dt05_simu dt05_real et05_simu et05_real" + else + tasks="dt05_simu dt05_real" + fi + for t in $tasks; do + steps/lmrescore.sh --mode 3 \ + data/lang_test_tgpr_5k \ + data/lang_test_${lm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $srcdir/decode_tgpr_5k_${t}_${enhan}_it$it \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} + done + # rescored results by high-order n-gram LM + mkdir -p $dir/log + local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ + > $dir/best_wer_${enhan}_${lm_suffix}.result + head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result +fi + +# N-best rescoring using a RNNLM +if [ $stage -le 4 ]; then + # check the best lmw + if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then + echo "error, rescoring with a high-order n-gram seems to be failed" + exit 1; + fi + lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` + # rescore n-best list for all sets + if $eval_flag; then + tasks="dt05_simu dt05_real et05_simu et05_real" + else + tasks="dt05_simu dt05_real" + fi + for t in $tasks; do + steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ + $rnnweight \ + data/lang_test_${lm_suffix} \ + data/lang_test_${rnnlm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ + $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} + done + # calc wers for RNNLM results + local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ + > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result + head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result +fi diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh b/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh new file mode 100755 index 00000000000..c7d62530d19 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +nj=12 +stage=1 +order=5 +hidden=300 +rnnweight=0.5 +nbest=100 +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + echo "Second argument specifies acoustic and language model directory" + exit 1; +fi + +# set language models +lm_suffix=${order}gkn_5k +rnnlm_suffix=rnnlm_5k_h${hidden} + +# enhan data +enhan=$1 +# set model directory +mdir=$2 +srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats + +# check language models +if [ ! 
-d $mdir/data/lang ]; then + echo "error, set $mdir correctly" + exit 1; +fi + +# preparation +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_lmrescore +mkdir -p $dir +# make a symbolic link to graph info +if [ ! -e $dir/graph_tgpr_5k ]; then + if [ ! -e exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then + echo "graph is missing, execute local/run_dnn.sh, correctly" + exit 1; + fi + pushd . ; cd $dir + ln -s ../tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k . + popd +fi + +# rescore lattices by a high-order N-gram +if [ $stage -le 3 ]; then + # check the best iteration + if [ ! -f $srcdir/log/best_wer_$enhan ]; then + echo "$0: error $srcdir/log/best_wer_$enhan not found. execute local/run_dnn.sh, first" + exit 1; + fi + it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` + # rescore lattices + if $eval_flag; then + tasks="dt05_simu dt05_real et05_simu et05_real" + else + tasks="dt05_simu dt05_real" + fi + for t in $tasks; do + steps/lmrescore.sh --mode 3 \ + $mdir/data/lang_test_tgpr_5k \ + $mdir/data/lang_test_${lm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $srcdir/decode_tgpr_5k_${t}_${enhan}_it$it \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} + done + # rescored results by high-order n-gram LM + mkdir -p $dir/log + local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ + > $dir/best_wer_${enhan}_${lm_suffix}.result + head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result +fi + +# N-best rescoring using a RNNLM +if [ $stage -le 4 ]; then + # check the best lmw + if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then + echo "error, rescoring with a high-order n-gram seems to be failed" + exit 1; + fi + lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` + # rescore n-best list for all sets + if $eval_flag; then + tasks="dt05_simu dt05_real et05_simu et05_real" + else + tasks="dt05_simu dt05_real" + fi + for t in $tasks; do + steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ + $rnnweight \ + $mdir/data/lang_test_${lm_suffix} \ + $mdir/data/lang_test_${rnnlm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ + $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} + done + # calc wers for RNNLM results + local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ + > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result + head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result +fi diff --git a/egs/chime4/s5_1ch/local/score.sh b/egs/chime4/s5_1ch/local/score.sh new file mode 100755 index 00000000000..b18f350416d --- /dev/null +++ b/egs/chime4/s5_1ch/local/score.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +reverse=false +word_ins_penalty=0.0 +min_lmwt=5 +max_lmwt=20 +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." 
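+ echo " --word_ins_penalty # word insertion penalty applied to the lattices before scoring (default 0.0)."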
+ echo " --min_lmwt # minimum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + echo " --reverse (true/false) # score with time reversed features " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab \ + ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; + +if $reverse; then + for lmwt in `seq $min_lmwt $max_lmwt`; do + mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig + awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \ + <$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra + done +fi + +# Note: the double level of quoting for the sed command +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; + +exit 0; diff --git a/egs/chime4/s5_1ch/local/simu_enhan_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/simu_enhan_chime4_data_prep.sh new file mode 100755 index 00000000000..c9e4dc96cc6 --- /dev/null +++ b/egs/chime4/s5_1ch/local/simu_enhan_chime4_data_prep.sh @@ -0,0 +1,112 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +# Modified from the script for CHiME2 baseline +# Shinji Watanabe 02/13/2015 + +# Config: +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The second argument should be the directory that only contains enhanced speech data." + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +enhan=$1 +audio_dir=$2 + +dir=`pwd`/data/local/data +mkdir -p $dir +local=`pwd`/local +utils=`pwd`/utils +odir=`pwd`/data + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ !
-x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +if $eval_flag; then +list_set="tr05_simu_$enhan dt05_simu_$enhan et05_simu_$enhan" +else +list_set="tr05_simu_$enhan dt05_simu_$enhan" +fi + +cd $dir + +find $audio_dir/ -name '*.wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_$enhan.flist +find $audio_dir/ -name '*.wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_$enhan.flist +if $eval_flag; then +find $audio_dir/ -name '*.wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_$enhan.flist +fi + +# make a scp file from file list +for x in $list_set; do + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp +done + +# make a transcription from dot +# simulation training data extract dot file from original WSJ0 data +# since it is generated from these data +if [ ! -e dot_files.flist ]; then + echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh"; + exit 1; +fi +cat tr05_simu_${enhan}_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \ + | $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_$enhan.txt +cat tr05_simu_${enhan}_wav.scp | cut -f 1 -d" " > tr05_simu_$enhan.ids +paste -d" " tr05_simu_$enhan.ids tr05_simu_$enhan.txt | sort -k 1 > tr05_simu_$enhan.trans1 +# dt05 and et05 simulation data are generated from the CHiME4 booth recording +# and we use CHiME4 dot files +cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> dt05_simu_$enhan.ids +cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_$enhan.txt +paste -d" " dt05_simu_$enhan.ids dt05_simu_$enhan.txt | sort -k 1 > dt05_simu_$enhan.trans1 +if $eval_flag; then +cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> et05_simu_$enhan.ids +cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_$enhan.txt +paste -d" " et05_simu_$enhan.ids et05_simu_$enhan.txt | sort -k 1 > et05_simu_$enhan.trans1 +fi + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in $list_set;do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Make the utt2spk and spk2utt files. +for x in $list_set; do + cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + cat ${x}_wav.scp | awk '{print $1}' > $x.utt + paste -d" " $x.utt $x.spk > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +# copying data to data/... +for x in $list_set; do + mkdir -p $odir/$x + cp ${x}_wav.scp $odir/$x/wav.scp || exit 1; + cp ${x}.txt $odir/$x/text || exit 1; + cp ${x}.spk2utt $odir/$x/spk2utt || exit 1; + cp ${x}.utt2spk $odir/$x/utt2spk || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh new file mode 100755 index 00000000000..6e7a827358e --- /dev/null +++ b/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh @@ -0,0 +1,122 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. 
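+# This script prepares the Kaldi data directories (wav.scp, text, utt2spk, spk2utt) for the simulated noisy CHiME4 sets (tr05/dt05, plus et05 when eval_flag=true), using channel 5 (CH5) of the isolated recordings.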
+ +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +# Modified from the script for CHiME2 baseline +# Shinji Watanabe 02/13/2015 + +# Config: +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The argument should be a the top-level Chime4 directory." + echo "It is assumed that there will be a 'data' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +audio_dir=$1/data/audio/16kHz/isolated +trans_dir=$1/data/transcriptions + +echo "extract 5th channel (CH5.wav, the center bottom edge in the front of the tablet) for noisy data" + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +if $eval_flag; then +list_set="tr05_simu_noisy dt05_simu_noisy et05_simu_noisy" +else +list_set="tr05_simu_noisy dt05_simu_noisy" +fi + +cd $dir + +find $audio_dir -name '*CH5.wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_noisy.flist +find $audio_dir -name '*CH5.wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_noisy.flist +if $eval_flag; then +find $audio_dir -name '*CH5.wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_noisy.flist +fi + +# make a dot format from json annotation files +cp $trans_dir/dt05_simu.dot_all dt05_simu.dot +if $eval_flag; then +cp $trans_dir/et05_simu.dot_all et05_simu.dot +fi + +# make a scp file from file list +for x in $list_set; do + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp +done + +# make a transcription from dot +# simulation training data extract dot file from original WSJ0 data +# since it is generated from these data +if [ ! -e dot_files.flist ]; then + echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh"; + exit 1; +fi +cat tr05_simu_noisy_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \ + | $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_noisy.txt +cat tr05_simu_noisy_wav.scp | cut -f 1 -d" " > tr05_simu_noisy.ids +paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -k 1 > tr05_simu_noisy.trans1 +# dt05 and et05 simulation data are generated from the CHiME4 booth recording +# and we use CHiME4 dot files +cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_SIMU"}'> dt05_simu_noisy.ids +cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_noisy.txt +paste -d" " dt05_simu_noisy.ids dt05_simu_noisy.txt | sort -k 1 > dt05_simu_noisy.trans1 +if $eval_flag; then +cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_SIMU"}'> et05_simu_noisy.ids +cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_noisy.txt +paste -d" " et05_simu_noisy.ids et05_simu_noisy.txt | sort -k 1 > et05_simu_noisy.trans1 +fi + +# Do some basic normalization steps. 
At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in $list_set;do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Make the utt2spk and spk2utt files. +for x in $list_set; do + cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + cat ${x}_wav.scp | awk '{print $1}' > $x.utt + paste -d" " $x.utt $x.spk > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +# copying data to data/... +for x in $list_set; do + mkdir -p ../../$x + cp ${x}_wav.scp ../../$x/wav.scp || exit 1; + cp ${x}.txt ../../$x/text || exit 1; + cp ${x}.spk2utt ../../$x/spk2utt || exit 1; + cp ${x}.utt2spk ../../$x/utt2spk || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/wsj_prepare_dict.sh b/egs/chime4/s5_1ch/local/wsj_prepare_dict.sh new file mode 100755 index 00000000000..6ddebd60293 --- /dev/null +++ b/egs/chime4/s5_1ch/local/wsj_prepare_dict.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2010-2012 Microsoft Corporation +# 2012-2014 Johns Hopkins University (Author: Daniel Povey) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# Call this script from one level above, e.g. from the s3/ directory. It puts +# its output in data/local/. + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + +# run this from ../ +dir=data/local/dict +mkdir -p $dir + + +# (1) Get the CMU dictionary +svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ + $dir/cmudict || exit 1; + +# can add -r 10966 for strict compatibility. + + +#(2) Dictionary preparation: + + +# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point). +# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones. + +# silence phones, one per line. +(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt +echo SIL > $dir/optional_silence.txt + +# nonsilence phones; on each line is a list of phones that correspond +# really to the same base phone. +cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' \ + | sort > $dir/nonsilence_phones.txt || exit 1; + +# A few extra questions that will be added to those obtained by automatically clustering +# the "real" phones. These ask about stress; there's also one for silence. 
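+# (for example: the first line written below is the silence question "SIL SPN NSN", and each following line groups all CMUdict phones that share the same stress digit, e.g. all "...1" phones)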
+cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; +cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dir/extra_questions.txt || exit 1; + +grep -v ';;;' $dir/cmudict/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add to cmudict the silences, noises etc. + +# the sort | uniq is to remove a duplicated pron from cmudict. +(echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; echo ' NSN'; ) | \ + cat - $dir/lexicon1_raw_nosil.txt | sort | uniq > $dir/lexicon2_raw.txt || exit 1; + + +# lexicon.txt is without the _B, _E, _S, _I markers. +# This is the input to wsj_format_data.sh +cp $dir/lexicon2_raw.txt $dir/lexicon.txt + +rm $dir/lexiconp.txt 2>/dev/null + +echo "Dictionary preparation succeeded" + diff --git a/egs/chime4/s5_1ch/path.sh b/egs/chime4/s5_1ch/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/chime4/s5_1ch/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/chime4/s5_1ch/run.sh b/egs/chime4/s5_1ch/run.sh new file mode 100755 index 00000000000..012a7eefc81 --- /dev/null +++ b/egs/chime4/s5_1ch/run.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# Kaldi ASR baseline for the CHiME-4 Challenge (1ch track: single channel track) +# +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. ./path.sh +. ./cmd.sh + +# Config: +stage=0 # resume training with --stage=N +flatstart=false + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +#####check data and model paths################ +# Set a main root directory of the CHiME4 data +# If you use scripts distributed in the CHiME4 package, +chime4_data=`pwd`/../.. +# Otherwise, please specify it, e.g., +chime4_data=/db/laputa1/data/processed/public/CHiME4 +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +# Set a model directory for the CHiME4 data. +modeldir=$chime4_data/tools/ASR_models +for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ + $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do + [ ! -d ] && echo "$0: no such directory $d. specify models correctly or execute './run.sh --flatstart true' first" && exit 1; +done +#####check data and model paths finished####### + + +#####main program start################ +# You can execute run_init.sh only "once" +# This creates 3-gram LM, FSTs, and basic task files +if [ $stage -le 0 ] && $flatstart; then + local/run_init.sh $chime4_data +fi + +# In this script, we use non-enhanced 6th microphone signals. 
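+# (no enhancement is applied in this 1ch baseline; the commented-out stage 1 below is where you can plug in your own single-channel enhancement)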
+enhancement_method=isolated_1ch_track +enhancement_data=$chime4_data/data/audio/16kHz/$enhancement_method +#if [ $stage -le 1 ]; then +# put your single channel enhancement +#fi + +# GMM based ASR experiment without "retraining" +# Please set a directory of your speech enhancement method. +# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. +# The directory structure and audio files must follow the attached baseline enhancement directory +if [ $stage -le 2 ]; then + if $flatstart; then + local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data + else + local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + fi +fi + +# DNN based ASR experiment +# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 3 ]; then + if $flatstart; then + local/run_dnn.sh $enhancement_method + else + local/run_dnn_recog.sh $enhancement_method $modeldir + fi +fi + +# LM-rescoring experiment with 5-gram and RNN LMs +# It takes a few days to train a RNNLM. +if [ $stage -le 4 ]; then + if $flatstart; then + local/run_lmrescore.sh $chime4_data $enhancement_method + else + local/run_lmrescore_recog.sh $enhancement_method $modeldir + fi +fi + +echo "Done." diff --git a/egs/chime4/s5_1ch/steps b/egs/chime4/s5_1ch/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/chime4/s5_1ch/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/chime4/s5_1ch/utils b/egs/chime4/s5_1ch/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/chime4/s5_1ch/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/chime4/s5_2ch/RESULTS b/egs/chime4/s5_2ch/RESULTS new file mode 100644 index 00000000000..81c18cccf07 --- /dev/null +++ b/egs/chime4/s5_2ch/RESULTS @@ -0,0 +1,49 @@ +# CHiME-4 2ch track results +# The result is based on Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15, +# and please refer the paper if you think the baseline useful. 
+# Note that the following result is different from that in the paper since we don't include +# SRI's robust features and system combination + +GMM noisy multi-condition with beamformit +exp/tri3b_tr05_multi_noisy/best_wer_beamformit_2mics.result +------------------- +best overall dt05 WER 17.69% (language model weight = 11) +------------------- +dt05_simu WER: 19.15% (Average), 16.14% (BUS), 23.55% (CAFE), 15.49% (PEDESTRIAN), 21.42% (STREET) +------------------- +dt05_real WER: 16.22% (Average), 20.12% (BUS), 16.25% (CAFE), 12.35% (PEDESTRIAN), 16.18% (STREET) +------------------- + +DNN sMBR +exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_beamformit_2mics.result +------------------- +best overall dt05 WER 11.63% (language model weight = 11) + (Number of iterations = 4) +------------------- +dt05_simu WER: 12.36% (Average), 10.66% (BUS), 15.55% (CAFE), 9.87% (PEDESTRIAN), 13.36% (STREET) +------------------- +dt05_real WER: 10.90% (Average), 13.62% (BUS), 10.63% (CAFE), 7.69% (PEDESTRIAN), 11.65% (STREET) +------------------- + +5-gram rescoring +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_2mics_5gkn_5k.result +------------------- +best overall dt05 WER 10.17% (language model weight = 11) +------------------- +dt05_simu WER: 10.72% (Average), 9.37% (BUS), 13.70% (CAFE), 8.07% (PEDESTRIAN), 11.73% (STREET) +------------------- +dt05_real WER: 9.63% (Average), 11.93% (BUS), 9.75% (CAFE), 6.46% (PEDESTRIAN), 10.37% (STREET) +------------------- + +RNNLM +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_2mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 8.86% (language model weight = 12) +------------------- +dt05_simu WER: 9.50% (Average), 8.19% (BUS), 12.15% (CAFE), 7.12% (PEDESTRIAN), 10.55% (STREET) +------------------- +dt05_real WER: 8.23% (Average), 10.90% (BUS), 7.96% (CAFE), 5.22% (PEDESTRIAN), 8.82% (STREET) +------------------- + + diff --git a/egs/chime4/s5_2ch/cmd.sh b/egs/chime4/s5_2ch/cmd.sh new file mode 100755 index 00000000000..2626a1a35b2 --- /dev/null +++ b/egs/chime4/s5_2ch/cmd.sh @@ -0,0 +1,21 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +#export train_cmd="queue.pl --mem 2G" +#export decode_cmd="queue.pl --mem 4G" +#export mkgraph_cmd="queue.pl --mem 8G" + +# run it locally... 
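+# (illustration only, not part of the original recipe: on a SLURM cluster the analogous settings would be something like 'export train_cmd="slurm.pl --mem 2G"' and 'export decode_cmd="slurm.pl --mem 4G"')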
+export train_cmd=run.pl +export decode_cmd=run.pl +export cuda_cmd=run.pl +export mkgraph_cmd=run.pl diff --git a/egs/chime4/s5_2ch/conf/chime4.cfg b/egs/chime4/s5_2ch/conf/chime4.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime4/s5_2ch/conf/chime4.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file, originally for AMI data (http://groups.inf.ed.ac.uk/ami/download/), reused here for CHiME4 + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process the whole file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime4/s5_2ch/conf/decode_dnn.config b/egs/chime4/s5_2ch/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/chime4/s5_2ch/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/chime4/s5_2ch/conf/fbank.conf b/egs/chime4/s5_2ch/conf/fbank.conf new file mode 100644 index 00000000000..5fc7774b31f --- /dev/null +++ b/egs/chime4/s5_2ch/conf/fbank.conf @@ -0,0 +1,11 @@ +# Filterbank feature configuration. +--window-type=hamming # use the standard Hamming window instead of Kaldi's default "povey" window +--use-energy=false # only fbank outputs +--sample-frequency=16000 # CHiME4 audio is sampled at 16kHz + +--low-freq=64 # typical setup from Frantisek Grezl +--high-freq=8000 +--dither=1 + +--num-mel-bins=40 # 40 mel bins for the 16kHz audio +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/timit/s4/conf/mfcc.conf b/egs/chime4/s5_2ch/conf/mfcc.conf similarity index 100% rename from egs/timit/s4/conf/mfcc.conf rename to egs/chime4/s5_2ch/conf/mfcc.conf diff --git a/egs/chime4/s5_2ch/local b/egs/chime4/s5_2ch/local new file mode 120000 index 00000000000..93f81ea6259 --- /dev/null +++ b/egs/chime4/s5_2ch/local @@ -0,0 +1 @@ +../s5_1ch/local \ No newline at end of file diff --git a/egs/chime4/s5_2ch/path.sh b/egs/chime4/s5_2ch/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/chime4/s5_2ch/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +.
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/chime4/s5_2ch/run.sh b/egs/chime4/s5_2ch/run.sh new file mode 100755 index 00000000000..16d92723fdf --- /dev/null +++ b/egs/chime4/s5_2ch/run.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Kaldi ASR baseline for the CHiME-4 Challenge (2ch track: 2 channel track) +# +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. ./path.sh +. ./cmd.sh + +# Config: +stage=0 # resume training with --stage=N +flatstart=false + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +#####check data and model paths################ +# Set a main root directory of the CHiME4 data +# If you use scripts distributed in the CHiME4 package, +chime4_data=`pwd`/../.. +# Otherwise, please specify it, e.g., +chime4_data=/db/laputa1/data/processed/public/CHiME4 +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +# Set a model directory for the CHiME4 data. +modeldir=$chime4_data/tools/ASR_models +for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ + $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do + [ ! -d ] && echo "$0: no such directory $d. specify models correctly or execute './run.sh --flatstart true' first" && exit 1; +done +#####check data and model paths finished####### + + +#####main program start################ +# You can execute run_init.sh only "once" +# This creates 3-gram LM, FSTs, and basic task files +if [ $stage -le 0 ] && $flatstart; then + local/run_init.sh $chime4_data +fi + +# Using Beamformit +# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 +# note that beamformed wav files are generated in the following directory +enhancement_method=beamformit_2mics +enhancement_data=`pwd`/enhan/$enhancement_method +if [ $stage -le 1 ]; then + local/run_beamform_2ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_2ch_track $enhancement_data +fi + +# GMM based ASR experiment without "retraining" +# Please set a directory of your speech enhancement method. +# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. +# The directory structure and audio files must follow the attached baseline enhancement directory +if [ $stage -le 2 ]; then + if $flatstart; then + local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data + else + local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + fi +fi + +# DNN based ASR experiment +# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 3 ]; then + if $flatstart; then + local/run_dnn.sh $enhancement_method + else + local/run_dnn_recog.sh $enhancement_method $modeldir + fi +fi + +# LM-rescoring experiment with 5-gram and RNN LMs +# It takes a few days to train a RNNLM. 
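+# (if you only need the results up to the sMBR DNN stage, you can skip the rescoring by running the script with --stage 5)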
+if [ $stage -le 4 ]; then + if $flatstart; then + local/run_lmrescore.sh $chime4_data $enhancement_method + else + local/run_lmrescore_recog.sh $enhancement_method $modeldir + fi +fi + +echo "Done." diff --git a/egs/chime4/s5_2ch/steps b/egs/chime4/s5_2ch/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/chime4/s5_2ch/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/chime4/s5_2ch/utils b/egs/chime4/s5_2ch/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/chime4/s5_2ch/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/chime4/s5_6ch/RESULTS b/egs/chime4/s5_6ch/RESULTS new file mode 100644 index 00000000000..533edc2704e --- /dev/null +++ b/egs/chime4/s5_6ch/RESULTS @@ -0,0 +1,48 @@ +# CHiME-4 6ch track results +# The result is based on Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15, +# and please refer the paper if you think the baseline useful. +# Note that the following result is different from that in the paper since we don't include +# SRI's robust features and system combination + +GMM noisy multi-condition with beamformit +exp/tri3b_tr05_multi_noisy/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 13.67% (language model weight = 11) +------------------- +dt05_simu WER: 14.30% (Average), 12.80% (BUS), 17.05% (CAFE), 11.90% (PEDESTRIAN), 15.46% (STREET) +------------------- +dt05_real WER: 13.03% (Average), 16.03% (BUS), 12.80% (CAFE), 10.02% (PEDESTRIAN), 13.27% (STREET) +------------------- + +DNN sMBR +exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 8.60% (language model weight = 11) + (Number of iterations = 4) +------------------- +dt05_simu WER: 9.07% (Average), 8.44% (BUS), 10.63% (CAFE), 7.39% (PEDESTRIAN), 9.82% (STREET) +------------------- +dt05_real WER: 8.14% (Average), 10.22% (BUS), 8.19% (CAFE), 5.69% (PEDESTRIAN), 8.45% (STREET) +------------------- + +5-gram rescoring +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_5gkn_5k.result +------------------- +best overall dt05 WER 7.30% (language model weight = 11) +------------------- +dt05_simu WER: 7.75% (Average), 7.14% (BUS), 9.13% (CAFE), 6.33% (PEDESTRIAN), 8.41% (STREET) +------------------- +dt05_real WER: 6.85% (Average), 8.53% (BUS), 6.90% (CAFE), 4.72% (PEDESTRIAN), 7.24% (STREET) +------------------- + +RNNLM +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 6.27% (language model weight = 12) +------------------- +dt05_simu WER: 6.77% (Average), 6.02% (BUS), 8.10% (CAFE), 5.49% (PEDESTRIAN), 7.48% (STREET) +------------------- +dt05_real WER: 5.76% (Average), 7.39% (BUS), 5.77% (CAFE), 3.72% (PEDESTRIAN), 6.18% (STREET) +------------------- + diff --git a/egs/chime4/s5_6ch/cmd.sh b/egs/chime4/s5_6ch/cmd.sh new file mode 100755 index 00000000000..2626a1a35b2 --- /dev/null +++ b/egs/chime4/s5_6ch/cmd.sh @@ -0,0 +1,21 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). 
slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +#export train_cmd="queue.pl --mem 2G" +#export decode_cmd="queue.pl --mem 4G" +#export mkgraph_cmd="queue.pl --mem 8G" + +# run it locally... +export train_cmd=run.pl +export decode_cmd=run.pl +export cuda_cmd=run.pl +export mkgraph_cmd=run.pl diff --git a/egs/chime4/s5_6ch/conf/chime4.cfg b/egs/chime4/s5_6ch/conf/chime4.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime4/s5_6ch/conf/chime4.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file, originally for AMI data (http://groups.inf.ed.ac.uk/ami/download/), reused here for CHiME4 + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process the whole file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime4/s5_6ch/conf/decode_dnn.config b/egs/chime4/s5_6ch/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/chime4/s5_6ch/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/chime4/s5_6ch/conf/fbank.conf b/egs/chime4/s5_6ch/conf/fbank.conf new file mode 100644 index 00000000000..5fc7774b31f --- /dev/null +++ b/egs/chime4/s5_6ch/conf/fbank.conf @@ -0,0 +1,11 @@ +# Filterbank feature configuration. +--window-type=hamming # use the standard Hamming window instead of Kaldi's default "povey" window +--use-energy=false # only fbank outputs +--sample-frequency=16000 # CHiME4 audio is sampled at 16kHz + +--low-freq=64 # typical setup from Frantisek Grezl +--high-freq=8000 +--dither=1 + +--num-mel-bins=40 # 40 mel bins for the 16kHz audio +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/chime4/s5_6ch/conf/mfcc.conf b/egs/chime4/s5_6ch/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/chime4/s5_6ch/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option.
diff --git a/egs/chime4/s5_6ch/local b/egs/chime4/s5_6ch/local new file mode 120000 index 00000000000..93f81ea6259 --- /dev/null +++ b/egs/chime4/s5_6ch/local @@ -0,0 +1 @@ +../s5_1ch/local \ No newline at end of file diff --git a/egs/chime4/s5_6ch/path.sh b/egs/chime4/s5_6ch/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/chime4/s5_6ch/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/chime4/s5_6ch/run.sh b/egs/chime4/s5_6ch/run.sh new file mode 100755 index 00000000000..d5a8b871a07 --- /dev/null +++ b/egs/chime4/s5_6ch/run.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Kaldi ASR baseline for the CHiME-4 Challenge (6ch track: 6 channel track) +# +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. ./path.sh +. ./cmd.sh + +# Config: +stage=0 # resume training with --stage=N +flatstart=false + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +#####check data and model paths################ +# Set a main root directory of the CHiME4 data +# If you use scripts distributed in the CHiME4 package, +chime4_data=`pwd`/../.. +# Otherwise, please specify it, e.g., +chime4_data=/db/laputa1/data/processed/public/CHiME4 +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +# Set a model directory for the CHiME4 data. +modeldir=$chime4_data/tools/ASR_models +for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ + $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do + [ ! -d ] && echo "$0: no such directory $d. specify models correctly or execute './run.sh --flatstart true' first" && exit 1; +done +#####check data and model paths finished####### + + +#####main program start################ +# You can execute run_init.sh only "once" +# This creates 3-gram LM, FSTs, and basic task files +if [ $stage -le 0 ] && $flatstart; then + local/run_init.sh $chime4_data +fi + +# Using Beamformit +# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 +# note that beamformed wav files are generated in the following directory +enhancement_method=beamformit_5mics +enhancement_data=`pwd`/enhan/$enhancement_method +if [ $stage -le 1 ]; then + local/run_beamform_6ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_6ch_track $enhancement_data +fi + +# GMM based ASR experiment without "retraining" +# Please set a directory of your speech enhancement method. +# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. 
+# The directory structure and audio files must follow the attached baseline enhancement directory +if [ $stage -le 2 ]; then + if $flatstart; then + local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data + else + local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + fi +fi + +# DNN based ASR experiment +# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 3 ]; then + if $flatstart; then + local/run_dnn.sh $enhancement_method + else + local/run_dnn_recog.sh $enhancement_method $modeldir + fi +fi + +# LM-rescoring experiment with 5-gram and RNN LMs +# It takes a few days to train a RNNLM. +if [ $stage -le 4 ]; then + if $flatstart; then + local/run_lmrescore.sh $chime4_data $enhancement_method + else + local/run_lmrescore_recog.sh $enhancement_method $modeldir + fi +fi + +echo "Done." diff --git a/egs/chime4/s5_6ch/steps b/egs/chime4/s5_6ch/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/chime4/s5_6ch/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/chime4/s5_6ch/utils b/egs/chime4/s5_6ch/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/chime4/s5_6ch/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/csj/README.txt b/egs/csj/README.txt index 9683c8b543c..268a313b458 100644 --- a/egs/csj/README.txt +++ b/egs/csj/README.txt @@ -1,16 +1,27 @@ About the Corpus of Spontaneous Japanese: The Corpus of Spontaneous Japanese (CSJ) is a database of spoken -Japanese developed by the Japan's national priority area research +Japanese developed by the Japan's national priority area research project "Spontaneous Speech: Corpus and Processing Technology". -It contains about 650 hours of speech consisting of approximately +It contains about 650 hours of speech consisting of approximately 7.5 million words that were provided by more than 1,400 speakers. -For more details about the corpus, please visit the website of the +For more details about the corpus, please visit the website of the National Institute for Japanese Language (NINJAL). It is available from the Institute. http://www.ninjal.ac.jp/english/products/csj/ http://pj.ninjal.ac.jp/corpus_center/csj/ +Meta-parameter tuning based on evolution strategy: +The meta-parameters of the system contained in conf/config_opt were +automatically tuned using evolution strategy. For the details, +please refer the following paper: +Takafumi Moriya, Tomohiro Tanaka, Takahiro Shinozaki, Shinji Watanabe, +and Kevin Duh, "Automation of System Building for State-of-the-art +Large Vocabulary Speech Recognition Using Evolution Strategy," Proc. +IEEE 2015 Automatic Speech Recognition and Understanding Workshop +(ASRU), 2015. + + Each subdirectory of this directory contains the -scripts for a sequence of experiments. +scripts for a sequence of experiments. s5: This is the current recommended recipe. - The third edition of CSJ is assumed. + The recipe supports the third and fourth editions of CSJ. diff --git a/egs/csj/s5/RESULTS b/egs/csj/s5/RESULTS index 208d99b8d66..340879aeda7 100644 --- a/egs/csj/s5/RESULTS +++ b/egs/csj/s5/RESULTS @@ -1,117 +1,118 @@ +## These are results using the third edition of CSJ. 
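+## Note: the decode results below are named wer_<LM-weight>_<word-insertion-penalty>, i.e. scoring now also sweeps the word insertion penalty (0.0, 0.5, 1.0) in addition to the LM weight.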
for eval_num in `seq 3`; do echo "=== evaluation set $eval_num ===" ;\ for x in exp/{tri,dnn}*/decode_eval${eval_num}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done ; done ## Results of using training set that contains "academic" and "other" speech data (default). -## If you want to use "trial lecture" and "dialog" data, you should check the following script [local/csj_data_prep.sh line 44]. +## If you want to use "simulated public speaking" and "dialog" data, you should check the following script [local/csj_data_prep.sh line 44]. === evaluation set 1 === -%WER 22.67 [ 6269 / 27651, 522 ins, 1903 del, 3844 sub ] exp/tri1/decode_eval1_csj/wer_12 -%WER 21.49 [ 5943 / 27651, 541 ins, 1745 del, 3657 sub ] exp/tri2/decode_eval1_csj/wer_12 -%WER 17.49 [ 4837 / 27651, 613 ins, 1269 del, 2955 sub ] exp/tri3/decode_eval1_csj/wer_16 -%WER 15.26 [ 4220 / 27651, 566 ins, 1071 del, 2583 sub ] exp/tri4/decode_eval1_csj/wer_17 -%WER 17.33 [ 4792 / 27651, 628 ins, 1137 del, 3027 sub ] exp/tri4/decode_eval1_csj.si/wer_16 -%WER 14.59 [ 4033 / 27651, 617 ins, 919 del, 2497 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it4_csj/wer_14 -%WER 14.14 [ 3911 / 27651, 585 ins, 915 del, 2411 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it5_csj/wer_17 -%WER 14.00 [ 3871 / 27651, 586 ins, 888 del, 2397 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it6_csj/wer_15 -%WER 13.92 [ 3850 / 27651, 661 ins, 793 del, 2396 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it7_csj/wer_14 -%WER 14.15 [ 3913 / 27651, 640 ins, 877 del, 2396 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it8_csj/wer_14 -%WER 14.39 [ 3979 / 27651, 570 ins, 946 del, 2463 sub ] exp/tri4_mmi_b0.1/decode_eval1_1.mdl_csj/wer_17 -%WER 14.09 [ 3895 / 27651, 576 ins, 882 del, 2437 sub ] exp/tri4_mmi_b0.1/decode_eval1_2.mdl_csj/wer_15 -%WER 14.02 [ 3877 / 27651, 602 ins, 858 del, 2417 sub ] exp/tri4_mmi_b0.1/decode_eval1_3.mdl_csj/wer_15 -%WER 14.00 [ 3870 / 27651, 609 ins, 853 del, 2408 sub ] exp/tri4_mmi_b0.1/decode_eval1_4.mdl_csj/wer_15 -%WER 11.93 [ 3298 / 27651, 348 ins, 970 del, 1980 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval1_csj/wer_12 -%WER 11.29 [ 3123 / 27651, 509 ins, 651 del, 1963 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval1_csj/wer_15 -%WER 10.87 [ 3007 / 27651, 497 ins, 589 del, 1921 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval1_csj/wer_14 +%WER 22.67 [ 6269 / 27651, 522 ins, 1903 del, 3844 sub ] exp/tri1/decode_eval1_csj/wer_12_0.0 +%WER 21.49 [ 5943 / 27651, 541 ins, 1745 del, 3657 sub ] exp/tri2/decode_eval1_csj/wer_12_0.0 +%WER 17.49 [ 4837 / 27651, 613 ins, 1269 del, 2955 sub ] exp/tri3/decode_eval1_csj/wer_16_0.0 +%WER 15.26 [ 4220 / 27651, 566 ins, 1071 del, 2583 sub ] exp/tri4/decode_eval1_csj/wer_17_0.0 +%WER 17.33 [ 4792 / 27651, 628 ins, 1137 del, 3027 sub ] exp/tri4/decode_eval1_csj.si/wer_16_0.0 +%WER 14.59 [ 4033 / 27651, 617 ins, 919 del, 2497 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it4_csj/wer_14_0.0 +%WER 14.14 [ 3911 / 27651, 585 ins, 915 del, 2411 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it5_csj/wer_17_0.0 +%WER 14.00 [ 3871 / 27651, 586 ins, 888 del, 2397 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it6_csj/wer_15_0.5 +%WER 13.92 [ 3850 / 27651, 661 ins, 793 del, 2396 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it7_csj/wer_14_0.5 +%WER 14.15 [ 3913 / 27651, 640 ins, 877 del, 2396 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it8_csj/wer_14_0.5 +%WER 14.39 [ 3979 / 27651, 570 ins, 946 del, 2463 sub ] exp/tri4_mmi_b0.1/decode_eval1_1.mdl_csj/wer_17_0.5 +%WER 14.09 [ 3895 / 27651, 576 ins, 882 del, 2437 sub ] exp/tri4_mmi_b0.1/decode_eval1_2.mdl_csj/wer_15_0.0 
+%WER 14.02 [ 3877 / 27651, 602 ins, 858 del, 2417 sub ] exp/tri4_mmi_b0.1/decode_eval1_3.mdl_csj/wer_15_0.5 +%WER 14.00 [ 3870 / 27651, 609 ins, 853 del, 2408 sub ] exp/tri4_mmi_b0.1/decode_eval1_4.mdl_csj/wer_15_0.5 +%WER 11.93 [ 3298 / 27651, 348 ins, 970 del, 1980 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval1_csj/wer_12_0.0 +%WER 11.29 [ 3123 / 27651, 509 ins, 651 del, 1963 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval1_csj/wer_15_1.0 +%WER 10.87 [ 3007 / 27651, 497 ins, 589 del, 1921 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval1_csj/wer_14_0.5 === evaluation set 2 === -%WER 19.80 [ 5628 / 28424, 561 ins, 1511 del, 3556 sub ] exp/tri1/decode_eval2_csj/wer_12 -%WER 19.04 [ 5413 / 28424, 600 ins, 1423 del, 3390 sub ] exp/tri2/decode_eval2_csj/wer_12 -%WER 15.80 [ 4490 / 28424, 582 ins, 1131 del, 2777 sub ] exp/tri3/decode_eval2_csj/wer_16 -%WER 13.95 [ 3964 / 28424, 691 ins, 843 del, 2430 sub ] exp/tri4/decode_eval2_csj/wer_13 -%WER 18.74 [ 5326 / 28424, 804 ins, 1056 del, 3466 sub ] exp/tri4/decode_eval2_csj.si/wer_17 -%WER 12.77 [ 3631 / 28424, 604 ins, 781 del, 2246 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it4_csj/wer_14 -%WER 12.27 [ 3488 / 28424, 604 ins, 707 del, 2177 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it5_csj/wer_14 -%WER 12.32 [ 3502 / 28424, 613 ins, 713 del, 2176 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it6_csj/wer_13 -%WER 12.32 [ 3502 / 28424, 658 ins, 688 del, 2156 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it7_csj/wer_14 -%WER 12.56 [ 3569 / 28424, 642 ins, 760 del, 2167 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it8_csj/wer_13 -%WER 12.51 [ 3557 / 28424, 588 ins, 766 del, 2203 sub ] exp/tri4_mmi_b0.1/decode_eval2_1.mdl_csj/wer_15 -%WER 12.25 [ 3482 / 28424, 587 ins, 730 del, 2165 sub ] exp/tri4_mmi_b0.1/decode_eval2_2.mdl_csj/wer_14 -%WER 12.20 [ 3467 / 28424, 599 ins, 706 del, 2162 sub ] exp/tri4_mmi_b0.1/decode_eval2_3.mdl_csj/wer_14 -%WER 12.33 [ 3504 / 28424, 615 ins, 714 del, 2175 sub ] exp/tri4_mmi_b0.1/decode_eval2_4.mdl_csj/wer_14 -%WER 10.24 [ 2910 / 28424, 271 ins, 852 del, 1787 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval2_csj/wer_12 -%WER 9.41 [ 2676 / 28424, 453 ins, 432 del, 1791 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2_csj/wer_14 -%WER 9.19 [ 2612 / 28424, 417 ins, 422 del, 1773 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval2_csj/wer_14 +%WER 19.80 [ 5628 / 28424, 561 ins, 1511 del, 3556 sub ] exp/tri1/decode_eval2_csj/wer_12_0.0 +%WER 19.04 [ 5413 / 28424, 600 ins, 1423 del, 3390 sub ] exp/tri2/decode_eval2_csj/wer_12_0.0 +%WER 15.80 [ 4490 / 28424, 582 ins, 1131 del, 2777 sub ] exp/tri3/decode_eval2_csj/wer_16_0.5 +%WER 13.95 [ 3964 / 28424, 691 ins, 843 del, 2430 sub ] exp/tri4/decode_eval2_csj/wer_13_0.0 +%WER 18.74 [ 5326 / 28424, 804 ins, 1056 del, 3466 sub ] exp/tri4/decode_eval2_csj.si/wer_17_0.0 +%WER 12.77 [ 3631 / 28424, 604 ins, 781 del, 2246 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it4_csj/wer_14_0.5 +%WER 12.27 [ 3488 / 28424, 604 ins, 707 del, 2177 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it5_csj/wer_14_0.5 +%WER 12.32 [ 3502 / 28424, 613 ins, 713 del, 2176 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it6_csj/wer_13_0.5 +%WER 12.32 [ 3502 / 28424, 658 ins, 688 del, 2156 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it7_csj/wer_14_1.0 +%WER 12.56 [ 3569 / 28424, 642 ins, 760 del, 2167 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it8_csj/wer_13_1.0 +%WER 12.51 [ 3557 / 28424, 588 ins, 766 del, 2203 sub ] exp/tri4_mmi_b0.1/decode_eval2_1.mdl_csj/wer_15_0.5 +%WER 12.25 [ 3482 / 28424, 587 ins, 730 del, 2165 sub ] 
exp/tri4_mmi_b0.1/decode_eval2_2.mdl_csj/wer_14_0.5 +%WER 12.20 [ 3467 / 28424, 599 ins, 706 del, 2162 sub ] exp/tri4_mmi_b0.1/decode_eval2_3.mdl_csj/wer_14_0.5 +%WER 12.33 [ 3504 / 28424, 615 ins, 714 del, 2175 sub ] exp/tri4_mmi_b0.1/decode_eval2_4.mdl_csj/wer_14_0.5 +%WER 10.24 [ 2910 / 28424, 271 ins, 852 del, 1787 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval2_csj/wer_12_0.0 +%WER 9.41 [ 2676 / 28424, 453 ins, 432 del, 1791 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2_csj/wer_14_1.0 +%WER 9.19 [ 2612 / 28424, 417 ins, 422 del, 1773 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval2_csj/wer_14_0.5 === evaluation set 3 === -%WER 24.80 [ 4534 / 18283, 447 ins, 1350 del, 2737 sub ] exp/tri1/decode_eval3_csj/wer_15 -%WER 23.68 [ 4329 / 18283, 497 ins, 1183 del, 2649 sub ] exp/tri2/decode_eval3_csj/wer_13 -%WER 19.97 [ 3651 / 18283, 582 ins, 828 del, 2241 sub ] exp/tri3/decode_eval3_csj/wer_17 -%WER 17.27 [ 3158 / 18283, 520 ins, 752 del, 1886 sub ] exp/tri4/decode_eval3_csj/wer_19 -%WER 21.44 [ 3919 / 18283, 660 ins, 823 del, 2436 sub ] exp/tri4/decode_eval3_csj.si/wer_20 -%WER 16.56 [ 3028 / 18283, 476 ins, 716 del, 1836 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it4_csj/wer_20 -%WER 15.79 [ 2887 / 18283, 547 ins, 554 del, 1786 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it5_csj/wer_15 -%WER 15.89 [ 2906 / 18283, 519 ins, 597 del, 1790 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it6_csj/wer_15 -%WER 15.64 [ 2860 / 18283, 556 ins, 512 del, 1792 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it7_csj/wer_15 -%WER 16.38 [ 2994 / 18283, 529 ins, 655 del, 1810 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it8_csj/wer_15 -%WER 16.13 [ 2949 / 18283, 505 ins, 630 del, 1814 sub ] exp/tri4_mmi_b0.1/decode_eval3_1.mdl_csj/wer_18 -%WER 15.97 [ 2920 / 18283, 540 ins, 556 del, 1824 sub ] exp/tri4_mmi_b0.1/decode_eval3_2.mdl_csj/wer_14 -%WER 15.98 [ 2922 / 18283, 564 ins, 537 del, 1821 sub ] exp/tri4_mmi_b0.1/decode_eval3_3.mdl_csj/wer_14 -%WER 15.98 [ 2921 / 18283, 548 ins, 566 del, 1807 sub ] exp/tri4_mmi_b0.1/decode_eval3_4.mdl_csj/wer_15 -%WER 13.94 [ 2548 / 18283, 313 ins, 716 del, 1519 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval3_csj/wer_13 -%WER 12.52 [ 2289 / 18283, 464 ins, 354 del, 1471 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval3_csj/wer_15 -%WER 12.18 [ 2226 / 18283, 431 ins, 340 del, 1455 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval3_csj/wer_15 +%WER 24.80 [ 4534 / 18283, 447 ins, 1350 del, 2737 sub ] exp/tri1/decode_eval3_csj/wer_15_0.0 +%WER 23.68 [ 4329 / 18283, 497 ins, 1183 del, 2649 sub ] exp/tri2/decode_eval3_csj/wer_13_0.0 +%WER 19.97 [ 3651 / 18283, 582 ins, 828 del, 2241 sub ] exp/tri3/decode_eval3_csj/wer_17_0.5 +%WER 17.27 [ 3158 / 18283, 520 ins, 752 del, 1886 sub ] exp/tri4/decode_eval3_csj/wer_19_0.0 +%WER 21.44 [ 3919 / 18283, 660 ins, 823 del, 2436 sub ] exp/tri4/decode_eval3_csj.si/wer_20_1.0 +%WER 16.56 [ 3028 / 18283, 476 ins, 716 del, 1836 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it4_csj/wer_20_0.0 +%WER 15.79 [ 2887 / 18283, 547 ins, 554 del, 1786 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it5_csj/wer_15_0.5 +%WER 15.89 [ 2906 / 18283, 519 ins, 597 del, 1790 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it6_csj/wer_15_0.5 +%WER 15.64 [ 2860 / 18283, 556 ins, 512 del, 1792 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it7_csj/wer_15_1.0 +%WER 16.38 [ 2994 / 18283, 529 ins, 655 del, 1810 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it8_csj/wer_15_0.5 +%WER 16.13 [ 2949 / 18283, 505 ins, 630 del, 1814 sub ] exp/tri4_mmi_b0.1/decode_eval3_1.mdl_csj/wer_18_0.0 +%WER 15.97 [ 2920 / 18283, 540 ins, 556 del, 
1824 sub ] exp/tri4_mmi_b0.1/decode_eval3_2.mdl_csj/wer_14_0.5 +%WER 15.98 [ 2922 / 18283, 564 ins, 537 del, 1821 sub ] exp/tri4_mmi_b0.1/decode_eval3_3.mdl_csj/wer_14_0.0 +%WER 15.98 [ 2921 / 18283, 548 ins, 566 del, 1807 sub ] exp/tri4_mmi_b0.1/decode_eval3_4.mdl_csj/wer_15_1.0 +%WER 13.94 [ 2548 / 18283, 313 ins, 716 del, 1519 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval3_csj/wer_13_0.0 +%WER 12.52 [ 2289 / 18283, 464 ins, 354 del, 1471 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval3_csj/wer_15_0.0 +%WER 12.18 [ 2226 / 18283, 431 ins, 340 del, 1455 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval3_csj/wer_15_0.5 -## Results of using training data that contain all types of speech data. +## Results of using training data that contain all types of speech data except for dialog type. === evaluation set 1 === -%WER 22.71 [ 6279 / 27651, 524 ins, 1936 del, 3819 sub ] exp/tri1/decode_eval1_csj/wer_13 -%WER 21.36 [ 5905 / 27651, 529 ins, 1781 del, 3595 sub ] exp/tri2/decode_eval1_csj/wer_13 -%WER 17.89 [ 4948 / 27651, 586 ins, 1314 del, 3048 sub ] exp/tri3/decode_eval1_csj/wer_16 -%WER 15.85 [ 4383 / 27651, 580 ins, 1169 del, 2634 sub ] exp/tri4/decode_eval1_csj/wer_17 -%WER 18.06 [ 4995 / 27651, 671 ins, 1209 del, 3115 sub ] exp/tri4/decode_eval1_csj.si/wer_15 -%WER 15.17 [ 4196 / 27651, 536 ins, 1105 del, 2555 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it4_csj/wer_17 -%WER 14.32 [ 3959 / 27651, 578 ins, 949 del, 2432 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it5_csj/wer_15 -%WER 14.20 [ 3926 / 27651, 598 ins, 885 del, 2443 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it6_csj/wer_13 -%WER 13.93 [ 3851 / 27651, 631 ins, 829 del, 2391 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it7_csj/wer_14 -%WER 14.09 [ 3895 / 27651, 621 ins, 847 del, 2427 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it8_csj/wer_12 -%WER 14.69 [ 4061 / 27651, 587 ins, 981 del, 2493 sub ] exp/tri4_mmi_b0.1/decode_eval1_1.mdl_csj/wer_15 -%WER 14.48 [ 4003 / 27651, 549 ins, 1001 del, 2453 sub ] exp/tri4_mmi_b0.1/decode_eval1_2.mdl_csj/wer_16 -%WER 14.33 [ 3963 / 27651, 611 ins, 901 del, 2451 sub ] exp/tri4_mmi_b0.1/decode_eval1_3.mdl_csj/wer_14 -%WER 14.12 [ 3905 / 27651, 610 ins, 870 del, 2425 sub ] exp/tri4_mmi_b0.1/decode_eval1_4.mdl_csj/wer_14 -%WER 11.62 [ 3214 / 27651, 381 ins, 799 del, 2034 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval1_csj/wer_12 -%WER 10.93 [ 3021 / 27651, 475 ins, 566 del, 1980 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval1_csj/wer_14 -%WER 10.71 [ 2962 / 27651, 516 ins, 496 del, 1950 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval1_csj/wer_13 +%WER 22.97 [ 6352 / 27651, 514 ins, 1941 del, 3897 sub ] exp/tri1/decode_eval1_csj/wer_13_0.0 +%WER 21.48 [ 5939 / 27651, 482 ins, 1885 del, 3572 sub ] exp/tri2/decode_eval1_csj/wer_14_0.0 +%WER 17.86 [ 4939 / 27651, 596 ins, 1305 del, 3038 sub ] exp/tri3/decode_eval1_csj/wer_15_0.0 +%WER 15.67 [ 4333 / 27651, 584 ins, 1121 del, 2628 sub ] exp/tri4/decode_eval1_csj/wer_16_0.0 +%WER 17.88 [ 4943 / 27651, 623 ins, 1226 del, 3094 sub ] exp/tri4/decode_eval1_csj.si/wer_16_0.0 +%WER 15.01 [ 4150 / 27651, 580 ins, 1009 del, 2561 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it4_csj/wer_15_0.0 +%WER 14.28 [ 3949 / 27651, 578 ins, 929 del, 2442 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it5_csj/wer_15_0.0 +%WER 14.17 [ 3917 / 27651, 542 ins, 966 del, 2409 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it6_csj/wer_15_0.0 +%WER 14.00 [ 3871 / 27651, 442 ins, 1085 del, 2344 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it7_csj/wer_12_1.0 +%WER 14.08 [ 3893 / 27651, 426 ins, 1087 del, 2380 sub ] 
exp/tri4_fmmi_b0.1/decode_eval1_it8_csj/wer_11_1.0 +%WER 14.60 [ 4036 / 27651, 458 ins, 1115 del, 2463 sub ] exp/tri4_mmi_b0.1/decode_eval1_1.mdl_csj/wer_15_0.5 +%WER 14.42 [ 3986 / 27651, 459 ins, 1081 del, 2446 sub ] exp/tri4_mmi_b0.1/decode_eval1_2.mdl_csj/wer_14_0.5 +%WER 14.22 [ 3931 / 27651, 492 ins, 1022 del, 2417 sub ] exp/tri4_mmi_b0.1/decode_eval1_3.mdl_csj/wer_13_0.5 +%WER 13.99 [ 3869 / 27651, 504 ins, 949 del, 2416 sub ] exp/tri4_mmi_b0.1/decode_eval1_4.mdl_csj/wer_12_0.5 +%WER 11.63 [ 3215 / 27651, 384 ins, 804 del, 2027 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval1_csj/wer_12_0.0 +%WER 10.56 [ 2921 / 27651, 366 ins, 662 del, 1893 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval1_csj/wer_13_1.0 +%WER 10.34 [ 2859 / 27651, 363 ins, 660 del, 1836 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval1_csj/wer_14_1.0 === evaluation set 2 === -%WER 19.61 [ 5575 / 28424, 577 ins, 1442 del, 3556 sub ] exp/tri1/decode_eval2_csj/wer_12 -%WER 18.47 [ 5250 / 28424, 572 ins, 1361 del, 3317 sub ] exp/tri2/decode_eval2_csj/wer_12 -%WER 15.71 [ 4464 / 28424, 577 ins, 1128 del, 2759 sub ] exp/tri3/decode_eval2_csj/wer_15 -%WER 13.24 [ 3764 / 28424, 535 ins, 921 del, 2308 sub ] exp/tri4/decode_eval2_csj/wer_16 -%WER 17.90 [ 5088 / 28424, 743 ins, 1057 del, 3288 sub ] exp/tri4/decode_eval2_csj.si/wer_16 -%WER 12.56 [ 3571 / 28424, 595 ins, 767 del, 2209 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it4_csj/wer_13 -%WER 11.79 [ 3350 / 28424, 584 ins, 669 del, 2097 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it5_csj/wer_13 -%WER 11.86 [ 3372 / 28424, 619 ins, 643 del, 2110 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it6_csj/wer_11 -%WER 11.79 [ 3352 / 28424, 603 ins, 659 del, 2090 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it7_csj/wer_13 -%WER 12.08 [ 3434 / 28424, 602 ins, 701 del, 2131 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it8_csj/wer_11 -%WER 12.13 [ 3447 / 28424, 561 ins, 735 del, 2151 sub ] exp/tri4_mmi_b0.1/decode_eval2_1.mdl_csj/wer_14 -%WER 11.88 [ 3376 / 28424, 575 ins, 676 del, 2125 sub ] exp/tri4_mmi_b0.1/decode_eval2_2.mdl_csj/wer_12 -%WER 11.77 [ 3345 / 28424, 588 ins, 646 del, 2111 sub ] exp/tri4_mmi_b0.1/decode_eval2_3.mdl_csj/wer_12 -%WER 11.73 [ 3333 / 28424, 586 ins, 658 del, 2089 sub ] exp/tri4_mmi_b0.1/decode_eval2_4.mdl_csj/wer_12 -%WER 9.36 [ 2660 / 28424, 357 ins, 561 del, 1742 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval2_csj/wer_10 -%WER 9.07 [ 2579 / 28424, 467 ins, 404 del, 1708 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2_csj/wer_13 -%WER 8.91 [ 2533 / 28424, 439 ins, 399 del, 1695 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval2_csj/wer_15 +%WER 19.56 [ 5560 / 28424, 560 ins, 1527 del, 3473 sub ] exp/tri1/decode_eval2_csj/wer_12_0.0 +%WER 18.62 [ 5293 / 28424, 610 ins, 1361 del, 3322 sub ] exp/tri2/decode_eval2_csj/wer_12_0.0 +%WER 15.58 [ 4429 / 28424, 626 ins, 1026 del, 2777 sub ] exp/tri3/decode_eval2_csj/wer_13_0.0 +%WER 13.37 [ 3801 / 28424, 643 ins, 844 del, 2314 sub ] exp/tri4/decode_eval2_csj/wer_14_0.0 +%WER 18.03 [ 5126 / 28424, 665 ins, 1178 del, 3283 sub ] exp/tri4/decode_eval2_csj.si/wer_15_0.5 +%WER 12.36 [ 3514 / 28424, 475 ins, 880 del, 2159 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it4_csj/wer_13_0.5 +%WER 11.54 [ 3279 / 28424, 448 ins, 792 del, 2039 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it5_csj/wer_13_0.5 +%WER 11.47 [ 3260 / 28424, 497 ins, 740 del, 2023 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it6_csj/wer_11_0.5 +%WER 11.34 [ 3223 / 28424, 476 ins, 713 del, 2034 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it7_csj/wer_10_1.0 +%WER 11.60 [ 3298 / 28424, 523 ins, 716 
del, 2059 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it8_csj/wer_10_0.5 +%WER 11.86 [ 3372 / 28424, 555 ins, 723 del, 2094 sub ] exp/tri4_mmi_b0.1/decode_eval2_1.mdl_csj/wer_14_0.0 +%WER 11.57 [ 3289 / 28424, 446 ins, 814 del, 2029 sub ] exp/tri4_mmi_b0.1/decode_eval2_2.mdl_csj/wer_13_0.5 +%WER 11.46 [ 3256 / 28424, 510 ins, 684 del, 2062 sub ] exp/tri4_mmi_b0.1/decode_eval2_3.mdl_csj/wer_11_0.5 +%WER 11.58 [ 3292 / 28424, 408 ins, 827 del, 2057 sub ] exp/tri4_mmi_b0.1/decode_eval2_4.mdl_csj/wer_11_1.0 +%WER 9.15 [ 2601 / 28424, 305 ins, 604 del, 1692 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval2_csj/wer_12_0.0 +%WER 8.69 [ 2469 / 28424, 367 ins, 444 del, 1658 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2_csj/wer_12_1.0 +%WER 8.62 [ 2450 / 28424, 349 ins, 444 del, 1657 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval2_csj/wer_13_1.0 === evaluation set 3 === -%WER 25.01 [ 4573 / 18283, 529 ins, 1219 del, 2825 sub ] exp/tri1/decode_eval3_csj/wer_13 -%WER 23.62 [ 4319 / 18283, 499 ins, 1176 del, 2644 sub ] exp/tri2/decode_eval3_csj/wer_14 -%WER 18.04 [ 3298 / 18283, 528 ins, 739 del, 2031 sub ] exp/tri3/decode_eval3_csj/wer_12 -%WER 15.63 [ 2858 / 18283, 411 ins, 719 del, 1728 sub ] exp/tri4/decode_eval3_csj/wer_15 -%WER 19.36 [ 3540 / 18283, 506 ins, 836 del, 2198 sub ] exp/tri4/decode_eval3_csj.si/wer_17 -%WER 14.90 [ 2724 / 18283, 456 ins, 602 del, 1666 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it4_csj/wer_13 -%WER 13.70 [ 2504 / 18283, 456 ins, 477 del, 1571 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it5_csj/wer_13 -%WER 13.78 [ 2520 / 18283, 460 ins, 548 del, 1512 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it6_csj/wer_12 -%WER 13.08 [ 2391 / 18283, 517 ins, 400 del, 1474 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it7_csj/wer_12 -%WER 13.75 [ 2514 / 18283, 469 ins, 562 del, 1483 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it8_csj/wer_12 -%WER 14.14 [ 2585 / 18283, 436 ins, 537 del, 1612 sub ] exp/tri4_mmi_b0.1/decode_eval3_1.mdl_csj/wer_14 -%WER 13.83 [ 2529 / 18283, 429 ins, 547 del, 1553 sub ] exp/tri4_mmi_b0.1/decode_eval3_2.mdl_csj/wer_14 -%WER 13.54 [ 2475 / 18283, 460 ins, 492 del, 1523 sub ] exp/tri4_mmi_b0.1/decode_eval3_3.mdl_csj/wer_13 -%WER 13.36 [ 2443 / 18283, 463 ins, 482 del, 1498 sub ] exp/tri4_mmi_b0.1/decode_eval3_4.mdl_csj/wer_13 -%WER 10.55 [ 1928 / 18283, 242 ins, 482 del, 1204 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval3_csj/wer_13 -%WER 9.71 [ 1775 / 18283, 338 ins, 271 del, 1166 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval3_csj/wer_13 -%WER 9.31 [ 1703 / 18283, 336 ins, 247 del, 1120 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval3_csj/wer_13 \ No newline at end of file +%WER 25.00 [ 4570 / 18283, 515 ins, 1277 del, 2778 sub ] exp/tri1/decode_eval3_csj/wer_14_0.0 +%WER 23.93 [ 4375 / 18283, 560 ins, 1163 del, 2652 sub ] exp/tri2/decode_eval3_csj/wer_14_0.0 +%WER 17.66 [ 3229 / 18283, 484 ins, 773 del, 1972 sub ] exp/tri3/decode_eval3_csj/wer_14_0.0 +%WER 15.46 [ 2827 / 18283, 311 ins, 860 del, 1656 sub ] exp/tri4/decode_eval3_csj/wer_17_0.5 +%WER 18.92 [ 3459 / 18283, 424 ins, 910 del, 2125 sub ] exp/tri4/decode_eval3_csj.si/wer_16_0.5 +%WER 14.55 [ 2661 / 18283, 423 ins, 629 del, 1609 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it4_csj/wer_14_0.0 +%WER 13.38 [ 2446 / 18283, 362 ins, 572 del, 1512 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it5_csj/wer_13_0.5 +%WER 13.37 [ 2444 / 18283, 484 ins, 470 del, 1490 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it6_csj/wer_11_0.0 +%WER 12.96 [ 2370 / 18283, 332 ins, 570 del, 1468 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it7_csj/wer_12_1.0 +%WER 
13.62 [ 2490 / 18283, 440 ins, 549 del, 1501 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it8_csj/wer_10_0.5 +%WER 13.77 [ 2518 / 18283, 323 ins, 664 del, 1531 sub ] exp/tri4_mmi_b0.1/decode_eval3_1.mdl_csj/wer_15_0.5 +%WER 13.48 [ 2464 / 18283, 334 ins, 618 del, 1512 sub ] exp/tri4_mmi_b0.1/decode_eval3_2.mdl_csj/wer_13_0.5 +%WER 13.28 [ 2428 / 18283, 379 ins, 546 del, 1503 sub ] exp/tri4_mmi_b0.1/decode_eval3_3.mdl_csj/wer_12_0.5 +%WER 13.26 [ 2424 / 18283, 388 ins, 543 del, 1493 sub ] exp/tri4_mmi_b0.1/decode_eval3_4.mdl_csj/wer_12_0.5 +%WER 10.41 [ 1904 / 18283, 289 ins, 422 del, 1193 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval3_csj/wer_10_0.0 +%WER 9.34 [ 1707 / 18283, 251 ins, 341 del, 1115 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval3_csj/wer_13_1.0 +%WER 9.10 [ 1664 / 18283, 246 ins, 344 del, 1074 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval3_csj/wer_14_1.0 diff --git a/egs/csj/s5/cmd.sh b/egs/csj/s5/cmd.sh index d5952fe0f87..71dd849a93b 100644 --- a/egs/csj/s5/cmd.sh +++ b/egs/csj/s5/cmd.sh @@ -1,31 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64*" -#export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export train_cmd="run.pl" -export decode_cmd="run.pl" -#export cuda_cmd="..." -#export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export mkgraph_cmd="run.pl" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/csj/s5/conf/config_opt b/egs/csj/s5/conf/config_opt index 5868d671c3e..e91c33abfa2 100644 --- a/egs/csj/s5/conf/config_opt +++ b/egs/csj/s5/conf/config_opt @@ -3,7 +3,8 @@ # Apache 2.0 # Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055. 
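# Note: the cmd.sh files touched in this patch all default to a GridEngine-style queue via queue.pl.
# If no queueing system is available, the same exports can simply point at run.pl for purely local
# execution (running heavy stages one at a time to avoid exhausting memory); a minimal sketch:
# export train_cmd=run.pl
# export decode_cmd=run.pl
# export mkgraph_cmd=run.pl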
-# Current optimized parameter config for CSJ +# Currently optimized parameter config for CSJ + splice=17 nn_depth=6 hid_dim=1905 diff --git a/egs/csj/s5/conf/mfcc.conf b/egs/csj/s5/conf/mfcc.conf index 0e7dfcd69b0..a5b1cbc03a3 100644 --- a/egs/csj/s5/conf/mfcc.conf +++ b/egs/csj/s5/conf/mfcc.conf @@ -1,3 +1,2 @@ --use-energy=false # only non-default option. -#--sample-frequency=8000 # Switchboard is sampled at 8kHz --sample-frequency=16000 # CSJ is sampled at 16kHz diff --git a/egs/csj/s5/local/csj_data_prep.sh b/egs/csj/s5/local/csj_data_prep.sh index 7458c0ce395..73462f17832 100644 --- a/egs/csj/s5/local/csj_data_prep.sh +++ b/egs/csj/s5/local/csj_data_prep.sh @@ -50,7 +50,7 @@ cat $CSJ/dvd{3,5,6,7,8,9,10}/{A*,M*}/*-wav.list 2>/dev/null | sort > $dir/wav.fl n=`cat $dir/wav.flist | wc -l` [ $n -ne 986 ] && \ - echo Warning: expected 986 data data files, found $n + echo "Warning: expected 986 data files (Case : Using 'Academic lecture' and 'Other' data), found $n." # (1a) Transcriptions preparation @@ -102,7 +102,7 @@ awk '{segment=$1; split(segment,S,"[_]"); spkid=S[1]; print $1 " " spkid}' $dir/ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; -# Copy stuff into its final locations. +# Copy stuff into its final locations [this has been moved from the format_data script] mkdir -p data/train for f in spk2utt utt2spk wav.scp text segments; do cp data/local/train/$f data/train/$f || exit 1; diff --git a/egs/csj/s5/local/csj_eval_data_prep.sh b/egs/csj/s5/local/csj_eval_data_prep.sh index 623197775e5..a8b848de4e2 100644 --- a/egs/csj/s5/local/csj_eval_data_prep.sh +++ b/egs/csj/s5/local/csj_eval_data_prep.sh @@ -9,7 +9,7 @@ # To be run from one directory above this script. -# The input is directory name containing the official evaluation test set. +# The input is directory containing the official evaluation test set and transcripts. if [ $# -ne 2 ]; then echo "Usage: "`basename $0`" " diff --git a/egs/csj/s5/local/csj_make_trans/csj2kaldi4m.pl b/egs/csj/s5/local/csj_make_trans/csj2kaldi4m.pl index 7895fa3410d..05ff93a54f8 100755 --- a/egs/csj/s5/local/csj_make_trans/csj2kaldi4m.pl +++ b/egs/csj/s5/local/csj_make_trans/csj2kaldi4m.pl @@ -204,8 +204,10 @@ $word =~ s/\ン\ー/\ン/g; # $word =~ s/\ヮ/\ワ/g; $word =~ s/\ゎ/\わ/g; - $word =~ s/^\ゼロ$/\0/g; - $word =~ s/^\零$/\0/g; + + # Normalization +# $word =~ s/^\ゼロ$/\0/g; +# $word =~ s/^\零$/\0/g; # Arrange morpheme # This function is to arrange morpheme. diff --git a/egs/csj/s5/local/csj_make_trans/csj_automake.sh b/egs/csj/s5/local/csj_make_trans/csj_automake.sh index 132725c0466..8dbb507a631 100644 --- a/egs/csj/s5/local/csj_make_trans/csj_automake.sh +++ b/egs/csj/s5/local/csj_make_trans/csj_automake.sh @@ -11,22 +11,17 @@ if [ $# -ne 2 ]; then exit 1 fi - resource=$1 outd=$2 -csjext=./local/csj_make_trans/csj2kaldi4m.pl -csjconnect=./local/csj_make_trans/csjconnect.pl -k2phone=./local/csj_make_trans/kana2phone -vocab2dic=./local/csj_make_trans/vocab2dic.pl -reform=./local/csj_make_trans/reform.pl +[ ! -e $resource ] && echo "Not exist CSJ or incorrect PATH." && exit 1; -if [ ! -d ./csj-data/dvd17 ];then +if [ ! -e $outd/.done_make_trans ];then ( mkdir -p $outd rm $outd/al_sent4lex.txt -cp ./local/csj_make_trans/overview_csj-data $outd/README.txt +cp local/csj_make_trans/overview_csj-data $outd/README.txt # Make transcription file for each dvd and each lecture [ ! 
-x "`which nkf `" ]\ @@ -35,19 +30,14 @@ cp ./local/csj_make_trans/overview_csj-data $outd/README.txt for vol in dvd{3..17} ;do mkdir -p $outd/$vol + ( for id in `ls $resource/$vol`;do mkdir -p $outd/$vol/${id} rm -r $outd/$vol/00README.txt - - ( nkf -e -d $resource/$vol/$id/${id}.sdb > $outd/$vol/${id}/sdb.tmp - $csjext $outd/$vol/${id}/sdb.tmp $outd/$vol/$id/${id}.4lex $outd/$vol/$id/${id}.4trn.t - - $csjconnect 0.5 10 $outd/$vol/$id/${id}.4trn.t $id > $outd/$vol/$id/${id}-trans.text - + local/csj_make_trans/csj2kaldi4m.pl $outd/$vol/${id}/sdb.tmp $outd/$vol/$id/${id}.4lex $outd/$vol/$id/${id}.4trn.t - [ -z `grep $id local/csj_make_trans/testset` ]\ - && cat $outd/$vol/$id/${id}.4lex >> $outd/al_sent4lex.txt + local/csj_make_trans/csjconnect.pl 0.5 10 $outd/$vol/$id/${id}.4trn.t $id > $outd/$vol/$id/${id}-trans.text rm $outd/$vol/$id/{${id}.4trn.t,sdb.tmp} @@ -56,30 +46,16 @@ for vol in dvd{3..17} ;do else find $resource/$vol/$id -iname ${id}.wav >$outd/$vol/$id/${id}-wav.list fi - - - ) done + )& done wait +echo -n >$outd/.done_make_trans ) fi -## make lexicon.txt -if [ ! -f ./csj-data/lexicon/lexicon.txt ]; then - ( - mkdir -p $outd/lexicon - sort $outd/al_sent4lex.txt >lex.tmp123 - uniq lex.tmp123 > lex.tmp456 - ${vocab2dic} -p $k2phone -o lex.tmp123 lex.tmp456 - $reform lex.tmp123 | sort | uniq > $outd/lexicon/lexicon.txt - mv $outd/al_sent4lex.txt $outd/lexicon - rm lex.tmp123 lex.tmp456 ERROR - ) -fi - ## Exclude speech data given by test set speakers. -if [ ! -d ./csj-data/[eval,excluded] ]; then +if [ ! -e $outd/.done_mv_eval_dup ]; then ( mkdir -p $outd/eval mkdir -p $outd/excluded @@ -89,10 +65,10 @@ if [ ! -d ./csj-data/[eval,excluded] ]; then # Speech data given by test set speakers (eval2 : A01M0056) rm dup_list - for line in `cat local/csj_make_trans/A01M0056_duplication | less`; do + for line in `cat local/csj_make_trans/A01M0056_duplication`; do find $outd/dvd* -iname $line >>dup_list done - for list in `cat dup_list | less`;do + for list in `cat dup_list`;do mv $list $outd/excluded cp dup_list $outd/excluded/duplication.list done @@ -100,10 +76,10 @@ if [ ! -d ./csj-data/[eval,excluded] ]; then # Evaluation data rm dup_list - for line in `cat local/csj_make_trans/testset | less`; do + for line in `cat local/csj_make_trans/testset`; do find $outd/dvd* -iname $line >>dup_list done - for list in `cat dup_list | less`;do + for list in `cat dup_list`;do mv $list $outd/eval cp dup_list $outd/eval/evaluation.list done @@ -114,11 +90,28 @@ if [ ! -d ./csj-data/[eval,excluded] ]; then mv $outd/eval/{A01M0110,A01M0137,A01M0097,A04M0123,A04M0121,A04M0051,A03M0156,A03M0112,A03M0106,A05M0011} $outd/eval/eval1 mv $outd/eval/{A01M0056,A03F0072,A02M0012,A03M0016,A06M0064,A06F0135,A01F0034,A01F0063,A01F0001,A01M0141} $outd/eval/eval2 mv $outd/eval/{S00M0112,S00F0066,S00M0213,S00F0019,S00M0079,S01F0105,S00F0152,S00M0070,S00M0008,S00F0148} $outd/eval/eval3 + + echo -n >$outd/.done_mv_eval_dup + ) +fi + +## make lexicon.txt +if [ ! -e $outd/.done_make_lexicon ]; then + ( + cat $outd/{dvd*,excluded}/*/*.4lex >> $outd/al_sent4lex.txt + mkdir -p $outd/lexicon + sort $outd/al_sent4lex.txt >lex.tmp123 + uniq lex.tmp123 > lex.tmp456 + local/csj_make_trans/vocab2dic.pl -p local/csj_make_trans/kana2phone -o lex.tmp123 lex.tmp456 + local/csj_make_trans/reform.pl lex.tmp123 | sort | uniq > $outd/lexicon/lexicon.txt + mv $outd/al_sent4lex.txt $outd/lexicon + rm lex.tmp123 lex.tmp456 ERROR + + echo -n >$outd/.done_make_lexicon ) fi -comp_num=`ls -l $outd | wc -l` -[ ! $comp_num -eq 20 ] \ +[ ! 
3 -le `ls -a $outd | grep done | wc -l` ] \ && echo "ERROR : Processing is incorrect." && exit 1; -echo "Finish processing original CSJ data" +echo "Finish processing original CSJ data" && echo -n >$outd/.done_make_all diff --git a/egs/csj/s5/local/csj_make_trans/kana2phone b/egs/csj/s5/local/csj_make_trans/kana2phone index 76a0a4bff9e..6979a320389 100644 --- a/egs/csj/s5/local/csj_make_trans/kana2phone +++ b/egs/csj/s5/local/csj_make_trans/kana2phone @@ -141,4 +141,4 @@ ヴ+b u ツ+ts u シ+sh i -チ+ch i +チ+ch i diff --git a/egs/csj/s5/local/csj_make_trans/reform.pl b/egs/csj/s5/local/csj_make_trans/reform.pl index 1c267e2c491..d9f6ac3058b 100755 --- a/egs/csj/s5/local/csj_make_trans/reform.pl +++ b/egs/csj/s5/local/csj_make_trans/reform.pl @@ -1,4 +1,6 @@ #!/usr/bin/env perl +use warnings; + # Copyright 2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki) # 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) # Apache 2.0 @@ -6,8 +8,6 @@ # This script is to make lexicon for KALDI format. -use warnings; - while (<>){ chomp; @line=split(/\t/, $_); diff --git a/egs/csj/s5/local/nnet/run_dnn.sh b/egs/csj/s5/local/nnet/run_dnn.sh index 028be0b03e7..b0acce39d15 100644 --- a/egs/csj/s5/local/nnet/run_dnn.sh +++ b/egs/csj/s5/local/nnet/run_dnn.sh @@ -25,7 +25,7 @@ # Config: config=conf/config_opt . $config -gmmdir=exp/tri4 +gmmdir=exp/tri4 data_fmllr=data-fmllr-tri4 stage=0 # resume training with --stage=N # End of config. @@ -60,7 +60,7 @@ if [ $stage -le 1 ]; then fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then # Train the DNN optimizing per-frame cross-entropy. dir=exp/dnn5b_pretrain-dbn_dnn ali=${gmmdir}_ali_nodup @@ -86,7 +86,7 @@ dir=exp/dnn5b_pretrain-dbn_dnn_smbr srcdir=exp/dnn5b_pretrain-dbn_dnn acwt=0.0909 -if [ $stage -le 3 ]; then +if [ $stage -le 3 ]; then # First we generate lattices and alignments: steps/nnet/align.sh --nj 10 --cmd "$train_cmd" \ $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali || exit 1; diff --git a/egs/csj/s5/local/run_sgmm2.sh b/egs/csj/s5/local/run_sgmm2.sh index a5369e30205..ee836dc2043 100644 --- a/egs/csj/s5/local/run_sgmm2.sh +++ b/egs/csj/s5/local/run_sgmm2.sh @@ -17,14 +17,16 @@ steps/train_sgmm2_group.sh --cmd "$train_cmd" \ 18000 60000 data/train_nodup data/lang exp/tri4_ali_nodup \ exp/ubm5/final.ubm exp/sgmm2_5 || exit 1; + + +graph_dir=exp/sgmm2_5/graph_csj_tg +$train_cmd $graph_dir/mkgraph.log \ + utils/mkgraph.sh data/lang_csj_tg exp/sgmm2_5 $graph_dir for eval_num in `seq 3`; do - graph_dir=exp/sgmm2_5/graph_csj_tg - $train_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_csj_tg exp/sgmm2_5 $graph_dir steps/decode_sgmm2.sh --nj 10 \ --cmd "$decode_cmd" --config conf/decode.config \ - --transform-dir exp/tri4/decode_eval${eval_num}_csj_tg $graph_dir \ - data/eval${eval_num} exp/sgmm2_5/decode_eval${eval_num}_csj_tg + --transform-dir exp/tri4/decode_eval${eval_num}_csj $graph_dir \ + data/eval${eval_num} exp/sgmm2_5/decode_eval${eval_num}_csj done wait @@ -48,10 +50,10 @@ steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" \ for eval_num in `seq 3`; do for iter in 1 2 3 4; do steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4/decode_eval${eval_num}_csj_tg \ + --transform-dir exp/tri4/decode_eval${eval_num}_csj \ data/lang_csj_tg data/eval${eval_num} \ - exp/sgmm2_5/decode_eval${eval_num}_csj_tg \ - exp/sgmm2_5_mmi_b0.1/decode_eval${eval_num}_csj_tg_it$iter + exp/sgmm2_5/decode_eval${eval_num}_csj \ + 
exp/sgmm2_5_mmi_b0.1/decode_eval${eval_num}_csj_it$iter done done wait diff --git a/egs/csj/s5/local/score.sh b/egs/csj/s5/local/score.sh deleted file mode 100644 index 05981ab999e..00000000000 --- a/egs/csj/s5/local/score.sh +++ /dev/null @@ -1 +0,0 @@ -link ../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/csj/s5/local/score.sh b/egs/csj/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/csj/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/csj/s5/local/wer_hyp_filter b/egs/csj/s5/local/wer_hyp_filter index c2911317399..d07b0cf4c28 100644 --- a/egs/csj/s5/local/wer_hyp_filter +++ b/egs/csj/s5/local/wer_hyp_filter @@ -3,5 +3,5 @@ perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } while() { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}){ @W=split(/\+/,$a); $word=$W[0]; { print "$word "; }}} print "\n"; }' \ - '' + '' '' diff --git a/egs/csj/s5/local/wer_output_filter b/egs/csj/s5/local/wer_output_filter index c2911317399..d07b0cf4c28 100644 --- a/egs/csj/s5/local/wer_output_filter +++ b/egs/csj/s5/local/wer_output_filter @@ -3,5 +3,5 @@ perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } while() { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}){ @W=split(/\+/,$a); $word=$W[0]; { print "$word "; }}} print "\n"; }' \ - '' + '' '' diff --git a/egs/csj/s5/local/wer_ref_filter b/egs/csj/s5/local/wer_ref_filter index c2911317399..d07b0cf4c28 100644 --- a/egs/csj/s5/local/wer_ref_filter +++ b/egs/csj/s5/local/wer_ref_filter @@ -3,5 +3,5 @@ perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } while() { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}){ @W=split(/\+/,$a); $word=$W[0]; { print "$word "; }}} print "\n"; }' \ - '' + '' '' diff --git a/egs/csj/s5/path.sh b/egs/csj/s5/path.sh index 41f65d7a03c..8a4c29be4f8 100644 --- a/egs/csj/s5/path.sh +++ b/egs/csj/s5/path.sh @@ -1,8 +1,9 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$PWD:$PATH -#$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$KALDI_ROOT/tools/srilm/bin/i686:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export PATH=$PATH:/usr/local/cuda/bin export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:/usr/local/cuda/bin/nvcc -export LC_ALL=C +#export LC_ALL=C diff --git a/egs/csj/s5/run.sh b/egs/csj/s5/run.sh index 6c0af8106c8..fa5355f86f1 100644 --- a/egs/csj/s5/run.sh +++ b/egs/csj/s5/run.sh @@ -19,14 +19,14 @@ set -e # exit on error #: << '#SKIP' -if [ ! -d data/csj-data/eval ]; then +if [ ! -e data ]; then echo "CSJ transcription file does not exist" #local/csj_make_trans/csj_automake.sh || exit 1; - local/csj_make_trans/csj_automake.sh /database/NINJAL/CSJ/ data/csj-data 2>/dev/null + local/csj_make_trans/csj_automake.sh /database/NINJAL/CSJ data/csj-data 2>/dev/null fi wait -[ ! 
-d data/csj-data/eval ]\ +[ ! -e data/csj-data/.done_make_all ]\ && echo "Not finished processing CSJ data" && exit 1; # Prepare Corpus of Spontaneous Japanese (CSJ) data. @@ -36,7 +36,7 @@ local/csj_data_prep.sh data/csj-data/ local/csj_prepare_dict.sh -utils/prepare_lang.sh data/local/dict_nosp "" data/local/lang_nosp data/lang_nosp +utils/prepare_lang.sh --num-sil-states 4 data/local/dict_nosp "" data/local/lang_nosp data/lang_nosp # Now train the language models. local/csj_train_lms.sh data/local/train/text data/local/dict_nosp/lexicon.txt data/local/lm @@ -155,7 +155,7 @@ $train_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri3 $graph_dir for eval_num in `seq 3`; do steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \ - $graph_dir data/eval${eval_num} exp/tri3/decode_eval${eval_num}_csj + $graph_dir data/eval${eval_num} exp/tri3/decode_eval${eval_num}_csj_nosp done # Now we compute the pronunciation and silence probabilities from training data, diff --git a/egs/csj/s5/steps b/egs/csj/s5/steps deleted file mode 100644 index 5e522274378..00000000000 --- a/egs/csj/s5/steps +++ /dev/null @@ -1 +0,0 @@ -link ../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/csj/s5/steps b/egs/csj/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/csj/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/csj/s5/utils b/egs/csj/s5/utils deleted file mode 100644 index 1ebeb7c52c7..00000000000 --- a/egs/csj/s5/utils +++ /dev/null @@ -1 +0,0 @@ -link ../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/csj/s5/utils b/egs/csj/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/csj/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/farsdat/s5/cmd.sh b/egs/farsdat/s5/cmd.sh index d749f2c9f1f..71dd849a93b 100644 --- a/egs/farsdat/s5/cmd.sh +++ b/egs/farsdat/s5/cmd.sh @@ -1,25 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export cuda_cmd="run.pl" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=2500M,mem_free=2500M,matylda5=0.5" -#export decode_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=3000M,mem_free=3000M,matylda5=0.1" -#export mkgraph_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=4G,mem_free=4G,matylda5=3" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu*,long.q@pco203-0[0124] -l gpu=1" - -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/farsdat/s5/local/farsdat_format_data.sh b/egs/farsdat/s5/local/farsdat_format_data.sh index 033538656bd..8e565f11fd0 100644 --- a/egs/farsdat/s5/local/farsdat_format_data.sh +++ b/egs/farsdat/s5/local/farsdat_format_data.sh @@ -25,13 +25,10 @@ for lm_suffix in bg; do test=data/lang_test_${lm_suffix} mkdir -p $test cp -r data/lang/* $test - + gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \ - egrep -v ' | | ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -49,7 +46,7 @@ for lm_suffix in bg; do < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g done diff --git a/egs/farsdat/s5/local/farsdat_prepare_lm.sh b/egs/farsdat/s5/local/farsdat_prepare_lm.sh index 782e1e3ed8f..c04f756d438 100755 --- a/egs/farsdat/s5/local/farsdat_prepare_lm.sh +++ b/egs/farsdat/s5/local/farsdat_prepare_lm.sh @@ -25,13 +25,10 @@ for lm_suffix in bg; do test=data/lang_test_${lm_suffix} mkdir -p $test cp -r data/lang/* $test - + gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \ - egrep -v ' | | ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -49,7 +46,7 @@ for lm_suffix in bg; do < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g done diff --git a/egs/farsdat/s5/path.sh b/egs/farsdat/s5/path.sh index 1e48f21b323..62794699b41 100755 --- a/egs/farsdat/s5/path.sh +++ b/egs/farsdat/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index ab29f13d4cc..88db78823a5 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh b/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh index 70d2886cecc..90250ff521b 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014 Gaurav Kumar. Apache 2.0 # @@ -12,26 +12,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test cp -r data/lang/* data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. 
Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -60,4 +47,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index f453ab42058..8fe80b46784 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -1,13 +1,13 @@ #!/bin/bash # # Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) -# In addition the transcripts are needed as well. +# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) +# In addition the transcripts are needed as well. # To be run from one directory above this script. # Note: when creating your own data preparation scripts, it's a good idea # to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the +# id, that the output scp file is sorted on utterance id, and that the # transcription file is exactly the same length as the scp file and is also # sorted on utterance id (missing transcriptions should be removed from the # scp file using e.g. scripts/filter_scp.pl) @@ -18,8 +18,8 @@ export LC_ALL=C if [ $# -lt 2 ]; then - echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories, se -e ../run.sh for example." 
+ echo "Usage: $0 " + echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" exit 1; fi @@ -72,20 +72,20 @@ fi speech_d1=$dir/links/LDC2010S01/DISC1/data/speech speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -transcripts=$dir/links/LDC2010T04/data/transcripts - -fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts +transcripts=$dir/links/LDC2010T04/data/transcripts + +fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts #Now check if we got all the files that we needed -if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" - echo "The transcripts should contain 819 files" - exit 1; -fi +if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" + echo "The transcripts should contain 819 files" + exit 1; +fi if [ $stage -le 0 ]; then #Gather all the speech files together to create a file list @@ -105,7 +105,7 @@ if [ $stage -le 1 ]; then mv $tmpdir/reco2file_and_channel $dir/train_all/ fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then sort $tmpdir/text.1 | grep -v '((' | \ awk '{if (NF > 1){ print; }}' | \ sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ @@ -149,7 +149,7 @@ if [ $stage -le 3 ]; then for f in `cat $tmpdir/train_sph.flist`; do # convert to absolute path readlink -e $f - done > $tmpdir/train_sph_abs.flist + done > $tmpdir/train_sph_abs.flist cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 0f2bd037ba0..6d04f53c7e5 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -22,12 +22,32 @@ lexicon=$1 #Get all unique words, remove punctuation. if [ $stage -le 0 ]; then cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords - if [ -f "/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" ]; then - # Merge with gigaword corpus - $local/merge_lexicons.py - mv $tmpdir/uniquewords $tmpdir/uniquewords.small - mv $tmpdir/uniquewords64k $tmpdir/uniquewords + if [ ! -f "${tmpdir}/es_wordlist.json" ]; then + echo "Could not find the large collection of Spanish words es_wordlist.json" + echo "Trying to download it via wget" + + if ! which wget >&/dev/null; then + echo "This script requires you to first install wget" + exit 1; + fi + + cwd=`pwd` + cd $tmpdir + wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz + + if [ ! 
-e ${tmpdir}/es_wordlist.json.tgz ]; then + echo "Download of the large Spanish word list failed" + exit 1; + fi + + tar -xovzf es_wordlist.json.tgz || exit 1; + cd $cwd fi + + # Merge with gigaword corpus + $local/merge_lexicons.py ${tmpdir} ${lexicon} + mv $tmpdir/uniquewords $tmpdir/uniquewords.small + mv $tmpdir/uniquewords64k $tmpdir/uniquewords fi #Then get the list of phones form basic_rules in the lexicon folder @@ -50,6 +70,7 @@ if [ $stage -le 2 ]; then # representation cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ + | awk -F '[/][/]' '{print $1}' \ > $tmpdir/lexicon_raw fi diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 8c67ae56804..5c09f09bc35 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -7,55 +7,58 @@ import sys import json import codecs -import os import operator -wordlimit=64000 -uw_fisher="data/local/tmp/uniquewords" -uw_gigaword="/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" -uw_LDC="/export/corpora/LDC/LDC96L16/callhome_spanish_lexicon_970908/preferences" +wordlimit = 64000 +tmpdir = sys.argv[1] +ldc_lexicon = sys.argv[2] +uw_fisher = tmpdir + "/uniquewords" +uw_gigaword = tmpdir + "/es_wordlist.json" +uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary fisher = codecs.open(uw_fisher, encoding='utf-8') for line in fisher: - merged_lexicon.append(line.strip()) + merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the fisher data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') -for line in ldc: - entries = line.strip().split('\t') - if entries[0].lower() not in merged_lexicon: - merged_lexicon.append(entries[0].lower()) +for line in ldc: + entries = line.strip().split('\t') + if entries[0].lower() not in merged_lexicon: + merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the LDC data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) for item in gigaword: - # We need a maximum of wordlimit words in the lexicon - if len(merged_lexicon) == wordlimit: - break + # We need a maximum of wordlimit words in the lexicon + if len(merged_lexicon) == wordlimit: + break - if item[0].lower() not in merged_lexicon: - merged_lexicon.append(item[0].lower()) - -print "After adding the Gigaword data, the lexicon contains " + str(len(merged_lexicon)) + " entries." + if item[0].lower() not in merged_lexicon: + merged_lexicon.append(item[0].lower()) + +print "After adding the Gigaword data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." 
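# With the hard-coded paths replaced by command-line arguments, merge_lexicons.py is driven from
# fsp_prepare_dict.sh as "$local/merge_lexicons.py ${tmpdir} ${lexicon}"; concretely, assuming the
# default tmp directory and the LDC lexicon location given in run.sh, the call is roughly:
# local/merge_lexicons.py data/local/tmp /export/corpora/LDC/LDC96L16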
# Now write the uniquewords to a file -lf = codecs.open('data/local/tmp/uniquewords64k', encoding='utf-8', mode='w+') +lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + lf.write(item + "\n") lf.close() print "Finshed writing unique words" - diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 423d1dd0016..1a6fb5f891b 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/src/nnet:$KALDI_ROOT/src/nnet2:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnet-cpubin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 706f3793278..edd7f56bad2 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -17,12 +17,10 @@ set -e sfisher_speech=/home/mpost/data/LDC/LDC2010S01 sfisher_transcripts=/home/mpost/data/LDC/LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 -#split=/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt split=local/splits/split_fisher callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 -#split_callhome=/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome split=local/splits/split_callhome local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts @@ -33,16 +31,16 @@ local/fsp_prepare_dict.sh $spanish_lexicon # Rewrite ----------------------------- This section is no longer needed---- # At this point, it might make sense to use a bigger lexicon -# The one I will use is derived from this exercise (spanish fisher) and -# the LDC spanish lexicon along with the most frequent words derived from the +# The one I will use is derived from this exercise (spanish fisher) and +# the LDC spanish lexicon along with the most frequent words derived from the # gigaword corpus such that the total number of entries in the lexicon # are 64k # To generate the merged lexicon, run # /export/a04/gkumar/corpora/gigaword/bin/merge_lexicons.py # you might have to set the locations of the three lexicons within this -# file. Note that the LDC rule base phoneme generator works only from its -# own directory. So the merged lexicon is actually created in +# file. Note that the LDC rule base phoneme generator works only from its +# own directory. So the merged lexicon is actually created in # /export/a04/gkumar/corpora/LDC9..../spanish_lexicon../lexicon64k # This can be easily fixed and will be done. 
#TODO # Also run the clean lexicon script to take care of non stressable vowels @@ -57,11 +55,11 @@ utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang # Make sure that you do not use your test and your dev sets to train the LM -# Some form of cross validation is possible where you decode your dev/set based on an +# Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl -# to get the numbers. Depending on your needs, you might have to change the size of -# the splits within that file. The default paritions are based on the Kaldi + Joshua +# to get the numbers. Depending on your needs, you might have to change the size of +# the splits within that file. The default paritions are based on the Kaldi + Joshua # requirements which means that I have very large dev and test sets local/fsp_train_lms.sh $split local/fsp_create_test_lang.sh @@ -95,7 +93,7 @@ cp -r data/local/data/callhome_train_all data/callhome_train_all # MT Tune : Same as the ASR eval set (Use the lattices from here) # MT Eval : 20k utterances # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker -# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. +# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. # As noted above, the LM has not been trained on the dev and the test sets. #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test @@ -136,7 +134,7 @@ utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k local/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k -utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k +utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ data/train_10k_nodup data/lang exp/mono0a @@ -178,7 +176,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ diff --git a/egs/fisher_english/s5/cmd.sh b/egs/fisher_english/s5/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/fisher_english/s5/cmd.sh +++ b/egs/fisher_english/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_english/s5/local/fisher_create_test_lang.sh b/egs/fisher_english/s5/local/fisher_create_test_lang.sh index aaa45f8e4e1..1d7c4013b83 100755 --- a/egs/fisher_english/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_english/s5/local/fisher_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test cp -r data/lang/* data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. 
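# After building G.fst with the simplified arpa2fst invocation above, a quick sanity check
# (a sketch, assuming the OpenFst binaries are on the PATH) is to inspect its size and
# stochasticity:
# fstinfo data/lang_test/G.fst | grep -E "# of (states|arcs)"
# fstisstochastic data/lang_test/G.fst   # the first number printed should be close to zero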
@@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh b/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh index be2548cc667..eae5f7b8581 100755 --- a/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh +++ b/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh @@ -34,7 +34,7 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely # note: 12 epochs is too many, it's taking a very long time. steps/nnet2/train_pnorm_simple2.sh --stage $train_stage \ --num-epochs 12 \ - --io-opts "-tc 10" \ + --io-opts "--max-jobs-run 10" \ --num-jobs-nnet 8 --num-threads 1 \ --minibatch-size 512 --parallel-opts "$parallel_opts" \ --mix-up 15000 \ diff --git a/egs/fisher_english/s5/local/online/run_nnet2.sh b/egs/fisher_english/s5/local/online/run_nnet2.sh index 97f3d655b78..0b9adb7d315 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2.sh @@ -39,7 +39,7 @@ if [ $stage -le 6 ]; then # Because we have a lot of data here and we don't want the training to take # too long, we reduce the number of epochs from the defaults (15 + 5) to (3 + - # 1). The option "--io-opts '-tc 12'" is to have more than the default number + # 1). The option "--io-opts '--max-jobs-run 12'" is to have more than the default number # (5) of jobs dumping the egs to disk; this is OK since we're splitting our # data across four filesystems for speed. @@ -52,7 +52,7 @@ if [ $stage -le 6 ]; then --num-threads "$num_threads" \ --minibatch-size "$minibatch_size" \ --parallel-opts "$parallel_opts" \ - --io-opts "-tc 12" \ + --io-opts "--max-jobs-run 12" \ --num-jobs-nnet 6 \ --num-hidden-layers 4 \ --mix-up 12000 \ diff --git a/egs/fisher_english/s5/local/online/run_nnet2_b.sh b/egs/fisher_english/s5/local/online/run_nnet2_b.sh index e9e0041cf0e..7eac7cf0a7d 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2_b.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2_b.sh @@ -76,7 +76,7 @@ if [ $stage -le 4 ]; then # Because we have a lot of data here and we don't want the training to take # too long, we reduce the number of epochs from the defaults (15 + 5) to (1 + - # 1). The option "--io-opts '-tc 12'" is to have more than the default number + # 1). The option "--io-opts '--max-jobs-run 12'" is to have more than the default number # (5) of jobs dumping the egs to disk; this is OK since we're splitting our # data across four filesystems for speed. @@ -89,7 +89,7 @@ if [ $stage -le 4 ]; then --num-threads "$num_threads" \ --minibatch-size "$minibatch_size" \ --parallel-opts "$parallel_opts" \ - --io-opts "-tc 12" \ + --io-opts "--max-jobs-run 12" \ --num-jobs-nnet 6 \ --num-hidden-layers 5 \ --mix-up 12000 \ diff --git a/egs/fisher_english/s5/local/online/run_nnet2_multisplice.sh b/egs/fisher_english/s5/local/online/run_nnet2_multisplice.sh index 37a0f91d7cb..47ba36f0072 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2_multisplice.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2_multisplice.sh @@ -15,13 +15,13 @@ set -e # assume use_gpu=true since it would be way too slow otherwise. if ! cuda-compiled; then - cat <&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_swbd/s5/RESULTS b/egs/fisher_swbd/s5/RESULTS index 77306f8df4e..b8fe8371631 100644 --- a/egs/fisher_swbd/s5/RESULTS +++ b/egs/fisher_swbd/s5/RESULTS @@ -42,8 +42,77 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_eval2000*_fg; do grep Sum $x/ %WER 12.3 | 1831 21395 | 89.2 7.2 3.5 1.5 12.3 50.8 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_fsh_sw1_fg/score_13/eval2000.ctm.swbd.filt.sys %WER 11.8 | 1831 21395 | 89.6 7.2 3.2 1.4 11.8 49.0 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_offline_fsh_sw1_fg/score_11/eval2000.ctm.swbd.filt.sys +# nnet3 result on eval2000 +# BLSTM ran for about 760 hours, command: +# local/nnet3/run_lstm.sh --affix bidirectional --lstm-delay " [-1,1] [-2,2] [-3,3] " --label-delay 0 \ +# --cell-dim 1024 --recurrent-projection-dim 128 --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 --chunk-right-context 40 \ +# --extra-left-context 50 --extra-right-context 50 +# use tri-gram +for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 15.8 | 4459 42989 | 86.1 9.7 4.1 1.9 15.8 52.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 14.8 | 4459 42989 | 87.2 9.4 3.4 2.1 14.8 52.2 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_13_0.0/eval2000_hires.ctm.filt.sys +%WER 14.8 | 4459 42989 | 86.6 9.2 4.3 1.4 14.8 54.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 15.4 | 4459 42989 | 86.4 9.5 4.0 1.8 15.4 51.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 14.5 | 4459 42989 | 87.5 9.0 3.5 2.0 14.5 51.4 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_14_0.0/eval2000_hires.ctm.filt.sys +%WER 14.5 | 4459 42989 | 87.0 9.0 4.0 1.5 14.5 53.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys + +# nnet3 result on eval2000 for swbd subset +# use tri-gram +for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 11.6 | 1831 21395 | 89.7 7.3 3.0 1.3 11.6 47.7 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.3 | 1831 21395 | 91.0 6.4 2.5 1.3 10.3 45.9 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_19_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.7 | 1831 21395 | 90.3 6.7 3.0 1.0 10.7 45.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 11.1 | 1831 21395 | 90.2 7.0 2.8 1.3 11.1 46.2 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.0 | 1831 21395 | 91.3 6.3 2.4 1.3 10.0 45.1 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_19_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.4 | 1831 21395 | 90.6 6.5 2.9 1.0 10.4 45.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# nnet3 result on eval2000 for callhm subset +# use tri-gram +for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum 
$x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done +%WER 19.9 | 2628 21594 | 82.6 12.1 5.3 2.6 19.9 56.0 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 19.0 | 2628 21594 | 83.5 11.7 4.8 2.5 19.0 56.5 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_14_0.5/eval2000_hires.ctm.callhm.filt.sys +%WER 18.8 | 2628 21594 | 83.1 11.7 5.2 1.9 18.8 60.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 19.7 | 2628 21594 | 82.7 12.1 5.2 2.4 19.7 55.3 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 18.7 | 2628 21594 | 83.7 11.5 4.8 2.5 18.7 55.6 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_14_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 18.6 | 2628 21594 | 83.3 11.5 5.2 1.9 18.6 59.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +# chain result on eval2000 +# BLSTM ran for about 380 hours +# use tri-gram +for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 13.6 | 4459 42989 | 88.2 7.9 3.9 1.8 13.6 51.0 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +%WER 12.1 | 4459 42989 | 89.7 6.8 3.5 1.8 12.1 50.2 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 13.3 | 4459 42989 | 88.4 7.8 3.8 1.8 13.3 50.1 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys +%WER 12.0 | 4459 42989 | 89.6 6.5 3.8 1.7 12.0 49.3 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_8_0.5/eval2000_hires.ctm.filt.sys + +# chain result on eval2000 for swbd subset +# use tri-gram +for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 9.4 | 1831 21395 | 91.7 5.4 2.9 1.2 9.4 43.9 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_tg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 8.8 | 1831 21395 | 92.5 5.3 2.2 1.4 8.8 46.9 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_1.0/eval2000_hires.ctm.swbd.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 9.2 | 1831 21395 | 92.1 5.6 2.3 1.3 9.2 42.4 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_fg/score_9_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 8.5 | 1831 21395 | 92.6 4.9 2.4 1.2 8.5 44.1 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_9_1.0/eval2000_hires.ctm.swbd.filt.sys +# chain result on eval2000 for callhm subset +# use tri-gram +for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done +%WER 17.4 | 2628 21594 | 84.7 9.8 5.5 2.1 17.4 55.3 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 15.3 | 2628 21594 | 86.9 8.3 4.8 2.2 15.3 52.4 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | 
utils/best_wer.sh ; done +%WER 17.3 | 2628 21594 | 84.9 9.7 5.5 2.1 17.3 55.0 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 15.3 | 2628 21594 | 87.0 8.6 4.4 2.4 15.3 52.1 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_6_0.5/eval2000_hires.ctm.callhm.filt.sys # GMM and SGMM numbers reported on rt03 for x in exp/*/decode_rt03*; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done @@ -89,3 +158,69 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_rt03*_fg; do grep Sum $x/scor %WER 20.2 | 3970 36721 | 88.3 8.1 3.6 8.5 20.2 74.3 | exp/nnet2_online/nnet_ms_a_online/decode_rt03_utt_fsh_sw1_fg/score_11/rt03.ctm.swbd.filt.sys %WER 19.1 | 3970 36721 | 88.8 7.8 3.4 7.9 19.1 72.2 | exp/nnet2_online/nnet_ms_a_online/decode_rt03_utt_offline_fsh_sw1_fg/score_11/rt03.ctm.swbd.filt.sys +# nnet3 result on rt03 +# use tri-gram +for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 14.7 | 8420 76157 | 86.8 8.9 4.3 1.5 14.7 45.9 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.0/rt03_hires.ctm.filt.sys +%WER 13.6 | 8420 76157 | 87.9 8.4 3.8 1.5 13.6 44.4 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_tg_epoch2.adj/score_18_1.0/rt03_hires.ctm.filt.sys +%WER 14.2 | 8420 76157 | 87.0 8.7 4.3 1.2 14.2 46.9 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 14.4 | 8420 76157 | 87.1 8.8 4.2 1.5 14.4 45.2 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_11_0.0/rt03_hires.ctm.filt.sys +%WER 13.4 | 8420 76157 | 88.2 8.4 3.4 1.6 13.4 43.6 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_fg_epoch2.adj/score_16_0.0/rt03_hires.ctm.filt.sys +%WER 13.9 | 8420 76157 | 87.2 8.4 4.3 1.2 13.9 46.0 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_9_0.0/rt03_hires.ctm.filt.sys + +# nnet3 result on rt03 for swbd subset +# use tri-gram +for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 17.4 | 4450 39436 | 84.3 10.6 5.1 1.8 17.4 48.9 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.5/rt03_hires.ctm.swbd.filt.sys +%WER 16.1 | 4450 39436 | 85.7 9.9 4.4 1.8 16.1 47.2 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_tg_epoch2.adj/score_18_0.5/rt03_hires.ctm.swbd.filt.sys +%WER 16.6 | 4450 39436 | 84.7 10.0 5.3 1.3 16.6 49.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_10_0.5/rt03_hires.ctm.swbd.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 17.1 | 4450 39436 | 84.6 10.3 5.1 1.8 17.1 48.2 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_12_0.0/rt03_hires.ctm.swbd.filt.sys +%WER 15.9 | 4450 39436 | 85.9 9.9 4.2 1.8 15.9 46.7 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_fg_epoch2.adj/score_18_0.0/rt03_hires.ctm.swbd.filt.sys +%WER 16.3 | 4450 39436 | 85.0 9.8 5.1 1.3 16.3 49.0 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.swbd.filt.sys + +# nnet3 result on rt03 for fsh subset +# use tri-gram +for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done +%WER 11.8 | 3970 36721 | 89.4 7.2 3.5 1.2 11.8 42.5 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 10.9 | 3970 36721 | 90.4 6.8 2.7 1.3 10.9 40.6 | 
exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_tg_epoch2.adj/score_15_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 11.6 | 3970 36721 | 89.4 7.1 3.5 1.0 11.6 43.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done +%WER 11.4 | 3970 36721 | 89.7 6.9 3.4 1.1 11.4 41.5 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_11_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 10.6 | 3970 36721 | 90.7 6.6 2.7 1.3 10.6 39.8 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_fg_epoch2.adj/score_15_1.0/rt03_hires.ctm.fsh.filt.sys +%WER 11.4 | 3970 36721 | 89.5 6.7 3.8 1.0 11.4 42.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.fsh.filt.sys + +# chain result on rt03 +# BLSTM ran for about 380 hours +# use tri-gram +for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 12.7 | 8420 76157 | 88.5 7.2 4.2 1.3 12.7 43.2 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.filt.sys +%WER 11.7 | 8420 76157 | 89.8 6.6 3.6 1.5 11.7 43.7 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 12.4 | 8420 76157 | 88.9 7.0 4.1 1.3 12.4 42.7 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_fg/score_9_0.0/rt03_hires.ctm.filt.sys +%WER 11.4 | 8420 76157 | 89.9 6.1 3.9 1.3 11.4 43.4 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys + +# chain result on rt03 for swbd subset +# use tri-gram +for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 15.0 | 4450 39436 | 86.4 8.6 5.0 1.4 15.0 45.8 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys +%WER 13.3 | 4450 39436 | 88.3 7.5 4.2 1.6 13.3 45.2 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 14.8 | 4450 39436 | 86.5 8.0 5.5 1.3 14.8 45.5 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.swbd.filt.sys +%WER 13.0 | 4450 39436 | 88.5 7.3 4.2 1.6 13.0 44.8 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +# chain result on rt03 for fsh subset +# use tri-gram +for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done +%WER 10.2 | 3970 36721 | 91.1 6.0 3.0 1.2 10.2 40.2 | exp/chain/tdnn_7b_relu_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 9.8 | 3970 36721 | 91.4 5.3 3.3 1.2 9.8 42.0 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done +%WER 9.8 | 3970 36721 | 91.4 5.8 2.8 1.2 9.8 39.6 | exp/chain/tdnn_7b_relu_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 9.6 | 3970 36721 | 91.6 5.2 3.3 1.2 9.6 41.4 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys diff --git a/egs/fisher_swbd/s5/cmd.sh b/egs/fisher_swbd/s5/cmd.sh index e3294fde05a..88db78823a5 100644 --- a/egs/fisher_swbd/s5/cmd.sh +++ 
b/egs/fisher_swbd/s5/cmd.sh @@ -1,32 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - -#d) Gorgon cluster -#export train_cmd="gorgon_queue.pl -q gorgon" -#export decode_cmd="gorgon_queue.pl -q gorgon" -#export cuda_cmd="gorgon_queue.pl -q gorgon" -#export mkgraph_cmd="gorgon_queue.pl -q gorgon" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_swbd/s5/conf/MSU_single_letter.txt b/egs/fisher_swbd/s5/conf/MSU_single_letter.txt new file mode 100644 index 00000000000..1f7b419cca7 --- /dev/null +++ b/egs/fisher_swbd/s5/conf/MSU_single_letter.txt @@ -0,0 +1,26 @@ +A ey +B b iy +C s iy +D d iy +E iy +F eh f +G jh iy +H ey ch +I ay +J jh ey +K k ey +L eh l +M eh m +N eh n +O ow +P p iy +Q k y uw +R aa r +S eh s +T t iy +U y uw +V v iy +W d ah b ax l y uw +X eh k s +Y w ay +Z z iy diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh new file mode 100644 index 00000000000..b70da4e852a --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/blstm_6h +decode_iter= +decode_dir_affix= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +affix= +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" [-3,3] [-3,3] [-3,3] " \ + --xent-regularize 0.1 \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay 0 \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 1.414 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + + # decoding options + extra_left_context=$[$chunk_left_context+10] + extra_right_context=$[$chunk_right_context+10] + + for decode_set in eval2000 rt03; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk $chunk_width \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh new file mode 100644 index 00000000000..d0e1093bf93 --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_7b +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale 0.00001 \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --relu-dim 725 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 0.5 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in eval2000 rt03; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_swbd/s5/local/dict.patch b/egs/fisher_swbd/s5/local/dict.patch new file mode 100644 index 00000000000..7fcaa98b4f5 --- /dev/null +++ b/egs/fisher_swbd/s5/local/dict.patch @@ -0,0 +1,378 @@ +8645a8646 +> uh-hum ah m hh ah m +9006c9007 +< April ey p r ih l +--- +> April ey p r ax l +9144d9144 +< B ay zh aa n iy z +9261c9261 +< Battle b ae t el +--- +> Battle b ae t ax l +10014a10015 +> Chevy sh eh v iy +10211a10213 +> Colorado k ao l ax r aa d ow +10212a10215 +> Colorado' k ao l ax r aa d ow z +10370c10373 +< Creek k r ih k +--- +> Creek k r iy k +10889a10893 +> Eleven ax l eh v ih n +10951c10955 +< Erie ih r iy +--- +> Erie iy r iy +11183c11187 +< Forever f ax r eh v er +--- +> Forever f er eh v er +11231a11236 +> Friday f r ay d iy +11744a11750 +> History hh ih s t r iy +12004a12011,12012 +> Israel ih z r ih l +> Israel's ih z r ih l z +12573a12582 +> Lincoln l ih ng k ih n +12574a12584 +> Lincolns l ih ng k ih n z +13268c13278 +< NAACP eh ey ey s iy p iy +--- +> NAACP eh n ey ey s iy p iy +13286c13296 +< NIT eh ay t iy +--- +> NIT eh n ay t iy +13292c13302 +< NTSC eh t iy eh s s iy +--- +> NTSC eh n t iy eh s s iy +14058a14069 +> Quarter k ow r t er +14059a14071 +> Quarterback k ow r t er b ae k +14060a14073 +> Quarters k ow r t er z +14569a14583 +> Science s ay n s +15087a15102 +> Sunday s ah n d iy +15088a15104 +> Sunday's s ah n d iy z 
+15089a15106 +> Sundays s ah n d iy z +15290,15291c15307,15308 +< Texan t eh k sh ih n +< Texan's t eh k sh ih n s +--- +> Texan t eh k s ih n +> Texan's t eh k s ih n s +15335a15353 +> Thousands th aw z ih n z +15739c15757 +< Waco w ae k ow +--- +> Waco w ey k ow +15841a15860 +> Weekends w iy k eh n z +16782a16802 +> acceptable eh k s eh p ax b ax l +16833a16854 +> accounting ax k aw n ih ng +16948a16970 +> address ax d r eh s +17281a17304 +> already aa r d iy +17315a17339 +> am m +17709a17734 +> asked ae s t +17847a17873 +> attorney ih t er n iy +17919a17946 +> autopilot ao t ow p ay l ih t +17960a17988 +> awfully ao f l iy +18221a18250 +> basketball b ae s k ax b ao l +18222a18252 +> basketball's b ae s k ax b ao l z +18302a18333 +> become b ah k ah m +18303a18335 +> becomes b iy k ah m z +18344a18377 +> began b ax g en n +18817c18850 +< bottle b aa t el +--- +> bottle b aa t ax l +19332,19333c19365,19367 +< camera's k ae m ax r ax z +< cameras k ae m ax r ax z +--- +> camera k ae m r ax +> camera's k ae m r ax z +> cameras k ae m r ax z +19411a19446 +> capital k ae p ax l +19505a19541 +> carrying k ae r ih ng +20316a20353,20354 +> combination k aa m ih n ey sh ih n +> combinations k aa m ih n ey sh ih n z +20831a20870 +> contracts k aa n t r ae k s +21010a21050 +> costs k ao s +21062a21103 +> county k aw n iy +21371a21413 +> cultural k ao l ch ax r ax l +21372a21415 +> culturally k ao l ch ax r ax l iy +21373a21417 +> culture k ao l ch er +21375a21420 +> cultures k ao l ch er z +21543a21589 +> data d ey t ax +22097a22144 +> differently d ih f ax r ih n t l iy +22972a23020 +> effects ax f eh k t s +23016a23065 +> election ax l eh k sh ih n +23018a23068 +> elections ax l eh k sh ih n z +23052a23103 +> eleven ax l eh v ih n +23242a23294 +> enjoyable ae n jh oy ax b ax l +23248a23301 +> enjoys ae n jh oy z +23293a23347 +> entire ih n t ay r +23295a23350,23351 +> entirely ih n t ay r l iy +> entirety ih n t ay r t iy +23745a23802 +> extra eh k s t er +23818a23876 +> facts f ae k s +24508c24566 +< forever f ax r eh v er +--- +> forever f er eh v er +24514c24572 +< forget f ow r g eh t +--- +> forget f er r g eh t +24521a24580 +> forgot f er r g aa t +24522a24582 +> forgotten f er r g aa t ax n +24563a24624 +> forward f ow er d +24680a24742 +> frightening f r ay t n ih ng +24742a24805 +> full-time f ax l t ay m +24862a24926 +> garage g r aa jh +25218a25283 +> grandmother g r ae m ah dh er +25790a25856 +> heavily hh eh v ax l iy +25949a26016 +> history hh ih s t r iy +26038a26106 +> honestly aa n ax s t l iy +26039a26108 +> honesty aa n ax s t iy +26099a26169 +> horror hh ow r +26155a26226 +> houses hh aw z ih z +26184c26255 +< huh-uh hh ah hh ah +--- +> huh-uh ah hh ah +26189c26260 +< hum-um hh m hh m +--- +> hum-um ah m hh ah m +26236a26308 +> hunting hh ah n ih ng +26307a26380,26381 +> ideal ay d iy l +> idealist ay d iy l ih s t +26369a26444 +> imagine m ae jh ih n +26628a26704 +> individuals ih n d ih v ih jh ax l z +26968a27045 +> interest ih n t r ih s t +27184a27262 +> it'd ih d +27702a27781 +> lead l iy d +28378a28458 +> mandatory m ae n d ih t ow r iy +28885a28966 +> minute m ih n ih t +29167a29249 +> mountains m aw t n z +29317a29400 +> mysteries m ih s t r iy z +29318a29402 +> mystery m ih s t r iy +29470a29555 +> nervous n er v ih s +29578,29580c29663,29665 +< nobody n ow b aa d iy +< nobody'll n ow b aa d iy l +< nobody's n ow b aa d iy z +--- +> nobody n ow b ah d iy +> nobody'll n ow b ah d iy l +> nobody's n ow b ah d iy z +29712a29798 +> nuclear n uw k l iy r +29938a30025 +> 
onto aa n t ax +30051a30139 +> originally ax r ih jh ax l iy +30507a30596 +> particularly p er t ih k y ax l iy +30755a30845 +> perfectly p er f ih k l iy +30820a30911 +> personally p er s n ax l iy +30915a31007 +> physically f ih z ih k l iy +30986a31079 +> pilot p ay l ih t +30987a31081 +> pilot's p ay l ih t s +31227a31322 +> police p l iy s +31513a31609 +> prefer p er f er +31553a31650 +> prepare p r ax p ey r +31578a31676 +> prescription p er s k r ih p sh ih n +31579a31678 +> prescriptions p er s k r ih p sh ih n z +31770a31870 +> products p r aa d ax k s +31821a31922 +> projects p r aa jh eh k s +31908a32010 +> protect p er t eh k t +31909a32012 +> protected p er t eh k t ih d +31911a32015 +> protection p er t eh k sh ih n +31914a32019 +> protection p er t eh k t ih v +32149a32255 +> quarter k ow r t er +32414a32521 +> read r iy d +32785a32893 +> rehabilitation r iy ax b ih l ih t ey sh ih n +33150a33259 +> resource r ih s ow r s +33151a33261 +> resources r iy s ow r s ih z +33539c33649 +< roots r uh t s +--- +> roots r uw t s +33929a34040 +> science s ay n s +34315a34427 +> seventy s eh v ih n iy +34319,34320c34431,34432 +< severe s ax v iy r +< severely s ax v iy r l iy +--- +> severe s ih v iy r +> severely s ih v iy r l iy +35060a35173 +> software s ao f w ey r +35083a35197 +> solid s ao l ih d +35084a35199 +> solidly s ao l ih d l iy +35750a35866 +> stood s t ih d +35854a35971 +> strictly s t r ih k l iy +35889c36006 +< stronger s t r ao ng er +--- +> stronger s t r ao ng g er +36192a36310,36311 +> supposed s p ow z +> supposed s p ow s +36510a36630 +> tastes t ey s +36856a36977 +> thoroughly th er r l iy +36866a36988 +> thousands th aw z ih n z +37081c37203 +< toots t uh t s +--- +> toots t uw t s +37157a37280 +> toward t w ow r d +37158a37282 +> towards t w ow r d z +37564a37689 +> twenties t w eh n iy z +37565a37691 +> twentieth t w eh n iy ih th +37637a37764 +> unacceptable ah n ae k s eh p ax b ax l +37728a37856 +> understand ah n d er s t ae n +37860a37989 +> unless ih n l eh s +38040a38170 +> use y uw z +38049a38180 +> uses y uw z ih z +38125a38257 +> various v ah r iy ih s +38202a38335 +> versus v er s ih z +38381c38514 +< wacko w ae k ow +--- +> wacko w ey k ow +38455c38588 +< wanna w aa n ax +--- +> wanna w ah n ax +38675c38808 +< whatnot w ah t n aa t +--- +> whatnot w aa t n aa t +38676a38810 +> whatsoever w aa t s ow eh v er +38890c39024 +< wok w aa k +--- +> wok w ao k +38910a39045 +> wondering w ah n d r ih ng diff --git a/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh b/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh index aaa45f8e4e1..1d7c4013b83 100755 --- a/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test cp -r data/lang/* data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh b/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh index 246ef1b888f..fb07544a92a 100755 --- a/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh +++ b/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test_fsh cp -r data/lang/* data/lang_test_fsh -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test_fsh/words.txt \ - --osymbols=data/lang_test_fsh/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_fsh/G.fst - fstisstochastic data/lang_test_fsh/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test_fsh/G.fst +fstisstochastic data/lang_test_fsh/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test_fsh/G.fst | \ echo "$0 succeeded" - diff --git a/egs/fisher_swbd/s5/local/fisher_train_lms.sh b/egs/fisher_swbd/s5/local/fisher_train_lms.sh index 5d8b9e2e18d..a9e3fa4566a 100755 --- a/egs/fisher_swbd/s5/local/fisher_train_lms.sh +++ b/egs/fisher_swbd/s5/local/fisher_train_lms.sh @@ -30,6 +30,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm else echo Downloading and installing the kaldi_lm tools if [ ! 
-f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; diff --git a/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh b/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh index ebc954b756b..3133af6ee1f 100755 --- a/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh +++ b/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh @@ -30,6 +30,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm else echo Downloading and installing the kaldi_lm tools if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; diff --git a/egs/fisher_swbd/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_swbd/s5/local/nnet3/run_ivector_common.sh new file mode 100644 index 00000000000..4d083d61d0e --- /dev/null +++ b/egs/fisher_swbd/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +train_stage=-10 +generate_alignments=true # false if doing chain training +speed_perturb=true + +. ./path.sh +. ./utils/parse_options.sh + +# perturbed data preparation +train_set=train_nodup +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + + for datadir in train_nodup; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 + utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${datadir}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + utils/fix_data_dir.sh data/${datadir}_sp + rm -r data/temp0 data/${datadir}_tmp + done + fi + + if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/train_nodup_sp data/lang_nosp exp/tri5a exp/tri5a_ali_nodup_sp || exit 1 + fi + train_set=train_nodup_sp +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_swbd-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + # the 100k_nodup directory is copied seperately, as + # we want to use exp/tri1b_ali_100k_nodup for lda_mllt training + # the main train directory might be speed_perturbed + for dataset in $train_set train_100k_nodup; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + + # scale the waveforms, this is useful as we don't use CMVN + data_dir=data/${dataset}_hires + cat $data_dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in eval2000 rt03; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done + + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +fi + +# ivector extractor training +if [ $stage -le 5 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/train_100k_nodup_hires \ + data/lang_nosp exp/tri1b_ali exp/nnet3/tri2b +fi + +if [ $stage -le 6 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri2b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + # We extract iVectors on all the train_nodup data, which will be what we + # train the system on. 
+ + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; + + for data_set in eval2000 rt03; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1; + done +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/nnet3/run_lstm.sh b/egs/fisher_swbd/s5/local/nnet3/run_lstm.sh new file mode 100644 index 00000000000..fec07fb2983 --- /dev/null +++ b/egs/fisher_swbd/s5/local/nnet3/run_lstm.sh @@ -0,0 +1,158 @@ +#!/bin/bash + +# Copyright 2015 Johns Hopkins University (Author: Daniel Povey). +# 2015 Vijayaditya Peddinti +# 2015 Xingyu Na +# 2015 Pegah Ghahrmani +# Apache 2.0. + + +# this is a basic lstm script +# LSTM script runs for more epochs than the TDNN script +# and each epoch takes twice the time + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false + +stage=0 +train_stage=-10 +affix= +common_egs_dir= +reporting_email= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 + + +# training options +num_epochs=8 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=15 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <' $dir/score_LMWT_${wip}/stm.swbd '&&' \ - grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ + grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ + grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1; done fi @@ -137,8 +137,8 @@ rt03* ) if [ $stage -le 3 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.fsh.LMWT.${wip}.log \ - grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \ - grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \ + grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \ + grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \ $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.fsh $dir/score_LMWT_${wip}/${name}.ctm.fsh || exit 1; done fi diff --git a/egs/fisher_swbd/s5/local/swbd1_data_download.sh b/egs/fisher_swbd/s5/local/swbd1_data_download.sh new file mode 100755 index 00000000000..95c9d5e58a4 --- /dev/null +++ b/egs/fisher_swbd/s5/local/swbd1_data_download.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Switchboard-1 training data preparation customized for Edinburgh +# Author: Arnab Ghoshal (Jan 2013) + +# To be run from one directory above this script. + +## The input is some directory containing the switchboard-1 release 2 +## corpus (LDC97S62). Note: we don't make many assumptions about how +## you unpacked this. We are just doing a "find" command to locate +## the .sph files. + +. path.sh + +#check existing directories +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" + exit 1; +fi + +SWBD_DIR=$1 + +dir=data/local/train_swbd +mkdir -p $dir + +# Audio data directory check +if [ ! -d $SWBD_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# Trans directory check +if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then + ( + cd $dir; + if [ ! -d swb_ms98_transcriptions ]; then + echo " *** Downloading trascriptions and dictionary ***" + wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || + wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz + tar -xf switchboard_word_alignments.tar.gz + fi + ) +else + echo "Directory with transcriptions exists, skipping downloading" + [ -f $dir/swb_ms98_transcriptions ] \ + || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ +fi diff --git a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh index 552e304a6a3..54513437dbe 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh @@ -14,7 +14,7 @@ #check existing directories if [ $# != 1 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD" exit 1; fi @@ -23,7 +23,6 @@ SWBD_DIR=$1 dir=data/local/train_swbd mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" @@ -34,22 +33,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! 
-x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - # To get the SWBD transcriptions and dict, do: - echo " *** Downloading transcriptions and dictionary ***" - ( - cd $dir; - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; @@ -101,7 +84,7 @@ local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final trans # format acronyms in text python local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \ - -M data/local/dict/acronyms_swbd.map + -M data/local/dict_nosp/acronyms_swbd.map cp $dir/text $dir/text_bk mv $dir/text_map $dir/text diff --git a/egs/fisher_swbd/s5/path.sh b/egs/fisher_swbd/s5/path.sh index 3b05dc5e2ba..e14c6074f6b 100755 --- a/egs/fisher_swbd/s5/path.sh +++ b/egs/fisher_swbd/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../../ export PWD=`pwd` -export PATH=$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/src/onlinebin:$KALDI_ROOT/src/online2bin:$PWD/stanford-utils:$KALDI_ROOT/src/stanford-bin/:$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet-cpubin/:$KALDI_ROOT/src/kwsbin:$PWD:$KALDI_ROOT/tools/kaldi_lm:$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$PATH +export PATH=$KALDI_ROOT/src/ivectorbin:$PWD/stanford-utils:$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$KALDI_ROOT/tools/kaldi_lm:$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_swbd/s5/run.sh b/egs/fisher_swbd/s5/run.sh index 4bb0a55b0a9..fa3ad62fa84 100755 --- a/egs/fisher_swbd/s5/run.sh +++ b/egs/fisher_swbd/s5/run.sh @@ -25,7 +25,6 @@ local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62 # local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2 # local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1 - utils/prepare_lang.sh data/local/dict_nosp \ "" data/local/lang_nosp data/lang_nosp @@ -135,15 +134,14 @@ local/remove_dup_utts.sh 300 data/train data/train_nodup ) # Start training on the Switchboard subset, which has cleaner alignments - steps/train_mono.sh --nj 3 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang_nopp exp/mono0a + data/train_10k_nodup data/lang_nosp exp/mono0a steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/train_30k_nodup data/lang_nopp exp/mono0a exp/mono0a_ali || exit 1; + data/train_30k_nodup data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; steps/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_30k_nodup data/lang_nopp exp/mono0a_ali exp/tri1a || exit 1; + 3200 30000 data/train_30k_nodup data/lang_nosp exp/mono0a_ali exp/tri1a || exit 1; #used to be 2500 20000 ( graph_dir=exp/tri1a/graph_nosp_fsh_sw1_tg diff --git a/egs/gale_arabic/s5/cmd.sh b/egs/gale_arabic/s5/cmd.sh index 6e2777b595b..71dd849a93b 100755 --- a/egs/gale_arabic/s5/cmd.sh +++ b/egs/gale_arabic/s5/cmd.sh @@ -1,11 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5/local/gale_format_data.sh b/egs/gale_arabic/s5/local/gale_format_data.sh index 584702b4122..6675dd20f71 100755 --- a/egs/gale_arabic/s5/local/gale_format_data.sh +++ b/egs/gale_arabic/s5/local/gale_format_data.sh @@ -6,9 +6,9 @@ if [ -f path.sh ]; then . path.sh; else echo "missing path.sh"; exit 1; -fi +fi -for dir in test train; do +for dir in test train; do cp -pr data/local/$dir data/$dir done @@ -21,26 +21,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz rm -r data/lang_test cp -r data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". 
These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. diff --git a/egs/gale_arabic/s5/local/nnet/run_lstm.sh b/egs/gale_arabic/s5/local/nnet/run_lstm.sh index 39854360e14..aeb2272976b 100755 --- a/egs/gale_arabic/s5/local/nnet/run_lstm.sh +++ b/egs/gale_arabic/s5/local/nnet/run_lstm.sh @@ -45,7 +45,7 @@ if [ $stage -le 1 ]; then steps/nnet/train.sh --network-type lstm --learn-rate 0.00001 \ --cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \ --proto-opts "--clip-gradient 5.0" \ - --train-opts "--momentum 0.9 --halving-factor 0.65" \ + --train-tool-opts "--momentum 0.9 --halving-factor 0.65" \ --train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; diff --git a/egs/gale_arabic/s5/local/online/run_nnet2.sh b/egs/gale_arabic/s5/local/online/run_nnet2.sh index 6926a3670be..8ccbda5a8dc 100644 --- a/egs/gale_arabic/s5/local/online/run_nnet2.sh +++ b/egs/gale_arabic/s5/local/online/run_nnet2.sh @@ -126,7 +126,7 @@ if [ $stage -le 6 ]; then # Because we have a lot of data here and we don't want the training to take # too long, we reduce the number of epochs from the defaults (15) to (8). - # The option "--io-opts '-tc 12'" is to have more than the default number + # The option "--io-opts '--max-jobs-run 12'" is to have more than the default number # (5) of jobs dumping the egs to disk; this is OK since we're splitting our # data across four filesystems for speed. @@ -139,7 +139,7 @@ if [ $stage -le 6 ]; then --num-threads "$num_threads" \ --minibatch-size "$minibatch_size" \ --parallel-opts "$parallel_opts" \ - --io-opts "-tc 12" \ + --io-opts "--max-jobs-run 12" \ --num-jobs-nnet 6 \ --num-hidden-layers 4 \ --mix-up 12000 \ diff --git a/egs/gale_arabic/s5/path.sh b/egs/gale_arabic/s5/path.sh index db21a99a725..be11b34cbc6 100755 --- a/egs/gale_arabic/s5/path.sh +++ b/egs/gale_arabic/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=$(pwd)/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/gale_mandarin/s5/cmd.sh b/egs/gale_mandarin/s5/cmd.sh index 6e2777b595b..2d51ad82004 100755 --- a/egs/gale_mandarin/s5/cmd.sh +++ b/egs/gale_mandarin/s5/cmd.sh @@ -1,11 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated, but it's still used in this example +# directory. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/gale_mandarin/s5/local/gale_format_data.sh b/egs/gale_mandarin/s5/local/gale_format_data.sh index 15a2bfaef52..71187e89a12 100755 --- a/egs/gale_mandarin/s5/local/gale_format_data.sh +++ b/egs/gale_mandarin/s5/local/gale_format_data.sh @@ -6,9 +6,9 @@ if [ -f path.sh ]; then . path.sh; else echo "missing path.sh"; exit 1; -fi +fi -for dir in dev train; do +for dir in dev train; do cp -pr data/local/$dir data/$dir done @@ -22,26 +22,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz rm -r data/lang_dev cp -r data/lang data/lang_dev -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_dev/words.txt \ - --osymbols=data/lang_dev/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_dev/G.fst - fstisstochastic data/lang_dev/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_dev/G.fst +fstisstochastic data/lang_dev/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. 
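The G.fst construction in these format_data scripts is being simplified to a single arpa2fst call; as a minimal sketch of the new pattern (the data/lang_test directory is illustrative and should be whatever lang directory the surrounding script actually uses; the gale_mandarin script above keeps its LM under data/lang_dev, so there the symbol table and output path would be data/lang_dev/words.txt and data/lang_dev/G.fst to match the fstisstochastic check that follows):

  gunzip -c "$arpa_lm" | \
    arpa2fst --disambig-symbol=#0 \
      --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
  fstisstochastic data/lang_test/G.fst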
diff --git a/egs/gale_mandarin/s5/path.sh b/egs/gale_mandarin/s5/path.sh index db21a99a725..be11b34cbc6 100755 --- a/egs/gale_mandarin/s5/path.sh +++ b/egs/gale_mandarin/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=$(pwd)/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/gp/s1/local/gp_format_lms_edin.sh b/egs/gp/s1/local/gp_format_lms_edin.sh index 7fa6f181060..60e3c266d5c 100755 --- a/egs/gp/s1/local/gp_format_lms_edin.sh +++ b/egs/gp/s1/local/gp_format_lms_edin.sh @@ -40,20 +40,10 @@ function format_lms () { cp $work_dir/lang_test/$f $test done + # kkm: I am removing fstdeterminizelog from the following pipe, no point. gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \ - | find_arpa_oovs.pl $test/words.txt > $test/oovs_${lm_suffix}.txt - - # Removing all "illegal" combinations of and , which are supposed to - # occur only at being/end of utt. These can cause determinization failures - # of CLG [ends up being epsilon cycles]. - gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \ - | egrep -v ' | | ' \ - | arpa2fst - | fstprint \ - | remove_oovs.pl $test/oovs_${lm_suffix}.txt \ - | eps2disambig.pl | s2eps.pl \ - | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \ - --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon | fstdeterminizelog > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst set +e fstisstochastic $test/G.fst set -e @@ -73,7 +63,7 @@ function format_lms () { < $work_dir/local/lexicon_??.txt >tmpdir.g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst - fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r tmpdir.g @@ -99,7 +89,7 @@ echo "Preparing language models for test" format_lms GE17k_tg $WDIR/GE; format_lms GE17k_tg_pr $WDIR/GE; } >& $WDIR/GE/format_lms.log -# German - 60K +# German - 60K { format_lms GE60k_bg $WDIR/GE; format_lms GE60k_tg $WDIR/GE; format_lms GE60k_tg_pr $WDIR/GE; } >> $WDIR/GE/format_lms.log 2>&1 @@ -115,7 +105,7 @@ echo "Preparing language models for test" format_lms SP23k_tg_pr $WDIR/SP; } >& $WDIR/SP/format_lms.log # Swedish - 24K -# TODO(arnab): Something going wrong with the Swedish trigram LM. +# TODO(arnab): Something going wrong with the Swedish trigram LM. 
{ # format_lms SW24k_tg $WDIR/SW; # format_lms SW24k_tg_pr $WDIR/SW; format_lms SW24k_bg $WDIR/SW; } >& $WDIR/SW/format_lms.log diff --git a/egs/gp/s5/RESULTS b/egs/gp/s5/RESULTS index 760545cf59d..297ef23d2da 100644 --- a/egs/gp/s5/RESULTS +++ b/egs/gp/s5/RESULTS @@ -1,4 +1,94 @@ -$ for L in $GP_LANGUAGES; do grep WER exp/$L/mono/decode_dev_tgpr_sri/wer_* | ./utils/best_wer.sh ; doneexp/CZ/mono/decode_dev_tgpr_sri/wer_9:%WER 35.13 [ 5820 / 16568, 486 ins, 1116 del, 4218 sub ] +#!/bin/bash + +# this RESULTS file was obtained by Bogdan Vlasenko in February 2016. + +for x in exp/*/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done + +# Monophone, MFCC+delta+accel +%WER 45.16 [ 10073 / 22306, 684 ins, 2010 del, 7379 sub ] exp/FR/mono/decode_dev_tgpr_sri/wer_8 +%WER 26.96 [ 4149 / 15387, 285 ins, 933 del, 2931 sub ] exp/GE/mono/decode_dev_tgpr_sri/wer_11 +%WER 52.95 [ 10040 / 18962, 588 ins, 2182 del, 7270 sub ] exp/RU/mono/decode_dev_tgpr_sri/wer_8 + +%WER 41.80 [ 9071 / 21700, 513 ins, 1876 del, 6682 sub ] exp/FR/mono/decode_eval_tgpr_sri/wer_9 +%WER 44.71 [ 5347 / 11959, 399 ins, 1024 del, 3924 sub ] exp/GE/mono/decode_eval_tgpr_sri/wer_9 +%WER 51.55 [ 9416 / 18266, 533 ins, 1975 del, 6908 sub ] exp/RU/mono/decode_eval_tgpr_sri/wer_9 + +# First triphone build. +%WER 28.44 [ 6343 / 22306, 751 ins, 742 del, 4850 sub ] exp/FR/tri1/decode_dev_tgpr_sri/wer_14 +%WER 15.29 [ 2353 / 15387, 288 ins, 388 del, 1677 sub ] exp/GE/tri1/decode_dev_tgpr_sri/wer_18 +%WER 36.40 [ 6903 / 18962, 578 ins, 1876 del, 4449 sub ] exp/RU/tri1/decode_dev_tgpr_sri/wer_13 + +%WER 26.26 [ 5699 / 21700, 697 ins, 526 del, 4476 sub ] exp/FR/tri1/decode_eval_tgpr_sri/wer_13 +%WER 22.80 [ 2727 / 11959, 386 ins, 329 del, 2012 sub ] exp/GE/tri1/decode_eval_tgpr_sri/wer_15 +%WER 34.69 [ 6336 / 18266, 547 ins, 1470 del, 4319 sub ] exp/RU/tri1/decode_eval_tgpr_sri/wer_14 + +# tri2a is delta+delta-delta features. +%WER 28.45 [ 6345 / 22306, 771 ins, 725 del, 4849 sub ] exp/FR/tri2a/decode_dev_tgpr_sri/wer_14 +%WER 15.13 [ 2328 / 15387, 320 ins, 345 del, 1663 sub ] exp/GE/tri2a/decode_dev_tgpr_sri/wer_17 +%WER 36.62 [ 6944 / 18962, 526 ins, 2083 del, 4335 sub ] exp/RU/tri2a/decode_dev_tgpr_sri/wer_14 + +%WER 26.18 [ 5681 / 21700, 694 ins, 542 del, 4445 sub ] exp/FR/tri2a/decode_eval_tgpr_sri/wer_14 +%WER 22.52 [ 2693 / 11959, 341 ins, 363 del, 1989 sub ] exp/GE/tri2a/decode_eval_tgpr_sri/wer_17 +%WER 34.37 [ 6278 / 18266, 594 ins, 1378 del, 4306 sub ] exp/RU/tri2a/decode_eval_tgpr_sri/wer_14 + +# LDA+MLLT. +%WER 27.76 [ 6192 / 22306, 723 ins, 824 del, 4645 sub ] exp/FR/tri2b/decode_dev_tgpr_sri/wer_17 +%WER 13.78 [ 2121 / 15387, 300 ins, 313 del, 1508 sub ] exp/GE/tri2b/decode_dev_tgpr_sri/wer_18 +%WER 34.68 [ 6576 / 18962, 521 ins, 1872 del, 4183 sub ] exp/RU/tri2b/decode_dev_tgpr_sri/wer_15 + +%WER 25.43 [ 5519 / 21700, 724 ins, 532 del, 4263 sub ] exp/FR/tri2b/decode_eval_tgpr_sri/wer_15 +%WER 21.26 [ 2542 / 11959, 307 ins, 372 del, 1863 sub ] exp/GE/tri2b/decode_eval_tgpr_sri/wer_17 +%WER 32.83 [ 5997 / 18266, 522 ins, 1431 del, 4044 sub ] exp/RU/tri2b/decode_eval_tgpr_sri/wer_16 + +# LDA+MLLT+SAT. 
+%WER 25.62 [ 5714 / 22306, 746 ins, 634 del, 4334 sub ] exp/FR/tri3b/decode_dev_tgpr_sri/wer_18 +%WER 11.01 [ 1694 / 15387, 311 ins, 205 del, 1178 sub ] exp/GE/tri3b/decode_dev_tgpr_sri/wer_20 +%WER 32.48 [ 6159 / 18962, 556 ins, 1534 del, 4069 sub ] exp/RU/tri3b/decode_dev_tgpr_sri/wer_17 + +%WER 23.82 [ 5169 / 21700, 685 ins, 478 del, 4006 sub ] exp/FR/tri3b/decode_eval_tgpr_sri/wer_17 +%WER 17.72 [ 2119 / 11959, 329 ins, 248 del, 1542 sub ] exp/GE/tri3b/decode_eval_tgpr_sri/wer_18 +%WER 31.24 [ 5706 / 18266, 657 ins, 1046 del, 4003 sub ] exp/RU/tri3b/decode_eval_tgpr_sri/wer_16 + +# Some "SGMM2" experiments. +%WER 24.76 [ 5524 / 22306, 716 ins, 623 del, 4185 sub ] exp/FR/sgmm2_4a/decode_dev_tgpr_sri/wer_12 +%WER 9.61 [ 1478 / 15387, 253 ins, 174 del, 1051 sub ] exp/GE/sgmm2_4a/decode_dev_tgpr_sri/wer_13 +%WER 30.27 [ 5740 / 18962, 505 ins, 1301 del, 3934 sub ] exp/RU/sgmm2_4a/decode_dev_tgpr_sri/wer_12 + +%WER 22.88 [ 4965 / 21700, 675 ins, 430 del, 3860 sub ] exp/FR/sgmm2_4a/decode_eval_tgpr_sri/wer_11 +%WER 16.03 [ 1917 / 11959, 267 ins, 224 del, 1426 sub ] exp/GE/sgmm2_4a/decode_eval_tgpr_sri/wer_12 +%WER 29.06 [ 5309 / 18266, 494 ins, 1107 del, 3708 sub ] exp/RU/sgmm2_4a/decode_eval_tgpr_sri/wer_13 + +%WER 24.16 [ 5389 / 22306, 733 ins, 559 del, 4097 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it1/wer_12 +%WER 24.02 [ 5359 / 22306, 733 ins, 534 del, 4092 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it2/wer_12 +%WER 24.23 [ 5405 / 22306, 754 ins, 532 del, 4119 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it3/wer_12 +%WER 24.50 [ 5464 / 22306, 727 ins, 574 del, 4163 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it4/wer_13 +%WER 9.22 [ 1418 / 15387, 266 ins, 146 del, 1006 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it1/wer_12 +%WER 9.17 [ 1411 / 15387, 253 ins, 153 del, 1005 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it2/wer_13 +%WER 9.18 [ 1412 / 15387, 264 ins, 150 del, 998 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it3/wer_13 +%WER 9.31 [ 1432 / 15387, 271 ins, 150 del, 1011 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it4/wer_13 +%WER 29.96 [ 5681 / 18962, 465 ins, 1549 del, 3667 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it1/wer_11 +%WER 30.39 [ 5762 / 18962, 500 ins, 1669 del, 3593 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it2/wer_10 +%WER 31.00 [ 5879 / 18962, 420 ins, 1864 del, 3595 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it3/wer_11 +%WER 31.50 [ 5973 / 18962, 433 ins, 1926 del, 3614 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it4/wer_11 + +%WER 22.51 [ 4885 / 21700, 672 ins, 423 del, 3790 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it1/wer_12 +%WER 22.56 [ 4896 / 21700, 702 ins, 380 del, 3814 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it2/wer_11 +%WER 22.70 [ 4925 / 21700, 670 ins, 398 del, 3857 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it3/wer_12 +%WER 22.83 [ 4954 / 21700, 681 ins, 400 del, 3873 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it4/wer_12 +%WER 15.28 [ 1827 / 11959, 291 ins, 178 del, 1358 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it1/wer_11 +%WER 15.22 [ 1820 / 11959, 271 ins, 190 del, 1359 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it2/wer_12 +%WER 15.35 [ 1836 / 11959, 281 ins, 187 del, 1368 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it3/wer_12 +%WER 15.38 [ 1839 / 11959, 252 ins, 205 del, 1382 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it4/wer_13 +%WER 28.31 [ 5172 / 18266, 496 ins, 1127 del, 3549 sub ] 
exp/RU/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it1/wer_11 +%WER 28.64 [ 5232 / 18266, 446 ins, 1321 del, 3465 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it2/wer_11 +%WER 28.96 [ 5289 / 18266, 458 ins, 1334 del, 3497 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it3/wer_10 +%WER 29.55 [ 5398 / 18266, 421 ins, 1477 del, 3500 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it4/wer_11 + + +# these RESULTS were obtained with Arnab Ghoshal version of the script established in 2012. +$ for L in $GP_LANGUAGES; do grep WER exp/$L/mono/decode_dev_tgpr_sri/wer_* | ./utils/best_wer.sh ; done + +exp/CZ/mono/decode_dev_tgpr_sri/wer_9:%WER 35.13 [ 5820 / 16568, 486 ins, 1116 del, 4218 sub ] exp/FR/mono/decode_dev_tgpr_sri/wer_9:%WER 45.69 [ 10192 / 22306, 533 ins, 2323 del, 7336 sub ] exp/GE/mono/decode_dev_tgpr_sri/wer_11:%WER 27.48 [ 4228 / 15387, 278 ins, 974 del, 2976 sub ] exp/PL/mono/decode_dev_tgpr_sri/wer_9:%WER 36.45 [ 6437 / 17660, 607 ins, 1228 del, 4602 sub ] @@ -16,5 +106,3 @@ exp/PO/tri1/decode_dev_tgpr_sri/wer_14:%WER 26.56 [ 3461 / 13030, 477 ins, 795 d exp/RU/tri1/decode_dev_tgpr_sri/wer_14:%WER 33.89 [ 6427 / 18962, 575 ins, 1612 del, 4240 sub ] exp/SP/tri1/decode_dev_tgpr_sri/wer_16:%WER 26.73 [ 5105 / 19098, 551 ins, 1313 del, 3241 sub ] qghoshal@merlin:[~/globalphone/a1.1] - - diff --git a/egs/gp/s5/run.sh b/egs/gp/s5/run.sh index ed345efef14..933d3a4f566 100755 --- a/egs/gp/s5/run.sh +++ b/egs/gp/s5/run.sh @@ -2,6 +2,16 @@ # Copyright 2012 Arnab Ghoshal +# +# Copyright 2016 by Idiap Research Institute, http://www.idiap.ch +# +# See the file COPYING for the licence associated with this software. +# +# Author(s): +# Bogdan Vlasenko, February 2016 +# + + # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -22,47 +32,37 @@ echo "This shell script may run as-is on your system, but it is recommended that you run the commands one by one by copying and pasting into the shell." #exit 1; -[ -f cmd.sh ] && source ./cmd.sh \ - || echo "cmd.sh not found. Jobs may not execute properly." +[ -f cmd.sh ] && source ./cmd.sh || echo "cmd.sh not found. Jobs may not execute properly." # CHECKING FOR AND INSTALLING REQUIRED TOOLS: # This recipe requires shorten (3.6.1) and sox (14.3.2). # If they are not found, the local/gp_install.sh script will install them. -local/gp_check_tools.sh $PWD path.sh +#local/gp_check_tools.sh $PWD path.sh || exit 1; . path.sh || { echo "Cannot source path.sh"; exit 1; } # Set the locations of the GlobalPhone corpus and language models -GP_CORPUS=/mnt/matylda2/data/GLOBALPHONE -# GP_LM=/mnt/matylda6/ijanda/GLOBALPHONE_LM +GP_CORPUS=/idiap/resource/database/GLOBALPHONE GP_LM=$PWD/language_models # Set the languages that will actually be processed -# export GP_LANGUAGES="CZ FR GE PL PO RU SP VN" -export GP_LANGUAGES="CZ FR GE PL PO SP" +export GP_LANGUAGES="FR GE RU" # The following data preparation step actually converts the audio files from # shorten to WAV to take out the empty files and those with compression errors. 
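# Roughly, the shorten-to-WAV conversion done by the data preparation amounts to
# the sketch below; the file name and raw-format flags (sample rate, encoding,
# channel count) are assumptions for illustration and depend on the corpus:
#   shorten -x utt.adc.shn - | sox -t raw -r 16000 -e signed-integer -b 16 -c 1 - utt.wav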
-local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=$GP_CORPUS \ - --languages="$GP_LANGUAGES" -local/gp_dict_prep.sh --config-dir $PWD/conf $GP_CORPUS $GP_LANGUAGES -# # Use the following to map to X-SAMPA phoneset -# local/gp_dict_prep.sh --config-dir $PWD/conf --map-dir $PWD/conf/xsampa_map \ -# $GP_CORPUS $GP_LANGUAGES +local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=$GP_CORPUS --languages="$GP_LANGUAGES" || exit 1; +local/gp_dict_prep.sh --config-dir $PWD/conf $GP_CORPUS $GP_LANGUAGES || exit 1; for L in $GP_LANGUAGES; do - utils/prepare_lang.sh --position-dependent-phones true \ - data/$L/local/dict "" data/$L/local/lang_tmp data/$L/lang \ - >& data/$L/prepare_lang.log || exit 1; + utils/prepare_lang.sh --position-dependent-phones true \ + data/$L/local/dict "" data/$L/local/lang_tmp data/$L/lang \ + >& data/$L/prepare_lang.log || exit 1; done # Convert the different available language models to FSTs, and create separate # decoding configurations for each. for L in $GP_LANGUAGES; do - # $highmem_cmd data/$L/format_lm.log \ - # local/gp_format_lm.sh --filter-vocab-sri false $GP_LM $L & - $highmem_cmd data/$L/format_lm.log \ - local/gp_format_lm.sh --filter-vocab-sri true $GP_LM $L & + local/gp_format_lm.sh --filter-vocab-sri true $GP_LM $L & done wait @@ -72,13 +72,14 @@ for L in $GP_LANGUAGES; do for x in train dev eval; do ( steps/make_mfcc.sh --nj 6 --cmd "$train_cmd" data/$L/$x \ - exp/$L/make_mfcc/$x $mfccdir; + exp/$L/make_mfcc/$x $mfccdir; steps/compute_cmvn_stats.sh data/$L/$x exp/$L/make_mfcc/$x $mfccdir; ) & done done wait; + for L in $GP_LANGUAGES; do mkdir -p exp/$L/mono; steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ @@ -86,86 +87,107 @@ for L in $GP_LANGUAGES; do done wait; + for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/mono/graph_${lm_suffix} mkdir -p $graph_dir - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh --mono data/$L/lang_test_${lm_suffix} exp/$L/mono \ - $graph_dir + utils/mkgraph.sh --mono data/$L/lang_test_${lm_suffix} exp/$L/mono \ + $graph_dir steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ - exp/$L/mono/decode_dev_${lm_suffix} + exp/$L/mono/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/eval \ + exp/$L/mono/decode_eval_${lm_suffix} ) & done done - +# Train tri1, which is first triphone pass for L in $GP_LANGUAGES; do ( mkdir -p exp/$L/mono_ali steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/$L/train data/$L/lang exp/$L/mono exp/$L/mono_ali \ - >& exp/$L/mono_ali/align.log + data/$L/train data/$L/lang exp/$L/mono exp/$L/mono_ali \ + >& exp/$L/mono_ali/align.log num_states=$(grep "^$L" conf/tri.conf | cut -f2) num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) mkdir -p exp/$L/tri1 - steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \ - $num_states $num_gauss data/$L/train data/$L/lang exp/$L/mono_ali \ - exp/$L/tri1 >& exp/$L/tri1/train.log - ) & + steps/train_deltas.sh --cmd "$train_cmd" \ + --cluster-thresh 100 $num_states $num_gauss data/$L/train data/$L/lang \ + exp/$L/mono_ali exp/$L/tri1 >& exp/$L/tri1/train.log + ) & done wait; - +# Decode tri1 for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/tri1/graph_${lm_suffix} mkdir -p $graph_dir - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri1 $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri1 \ + $graph_dir steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ - 
exp/$L/tri1/decode_dev_${lm_suffix} + exp/$L/tri1/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/eval \ + exp/$L/tri1/decode_eval_${lm_suffix} ) & done done -# SAT-trained triphone systems: MFCC feats + +# Train tri2a, which is deltas + delta-deltas for L in $GP_LANGUAGES; do ( - mkdir -p exp/$L/tri1_ali_fmllr - steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \ - data/$L/train data/$L/lang exp/$L/tri1 exp/$L/tri1_ali_fmllr \ - >& exp/$L/tri1_ali_fmllr/align.log || exit 1; + mkdir -p exp/$L/tri1_ali + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/$L/train data/$L/lang exp/$L/tri1 exp/$L/tri1_ali \ + >& exp/$L/tri1_ali/tri1_ali.log num_states=$(grep "^$L" conf/tri.conf | cut -f2) num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) mkdir -p exp/$L/tri2a - steps/train_sat.sh --cmd "$train_cmd" --cluster-thresh 100 \ - $num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr \ - exp/$L/tri2a >& exp/$L/tri2a/train.log + steps/train_deltas.sh --cmd "$train_cmd" \ + --cluster-thresh 100 $num_states $num_gauss data/$L/train data/$L/lang \ + exp/$L/tri1_ali exp/$L/tri2a >& exp/$L/tri2a/train.log ) & done wait; +# Decode tri2a for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/tri2a/graph_${lm_suffix} mkdir -p $graph_dir - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri2a $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri2a \ + $graph_dir - steps/decode_fmllr.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ - exp/$L/tri2a/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ + exp/$L/tri2a/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/eval \ + exp/$L/tri2a/decode_eval_${lm_suffix} ) & done done +# Train tri2b, which is LDA+MLLT +for L in $GP_LANGUAGES; do + ( + num_states=$(grep "^$L" conf/tri.conf | cut -f2) + num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) + mkdir -p exp/$L/tri2b + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" $num_states $num_gauss data/$L/train \ + data/$L/lang exp/$L/tri1_ali exp/$L/tri2b >& exp/$L/tri2b/tri2_ali.log + ) & +done +wait; + # for L in $GP_LANGUAGES; do # mode=4 # # Doing this only for the LMs whose vocabs were limited using SRILM, since the @@ -175,19 +197,152 @@ done # exp/$L/tri2a/decode_dev_tgpr_sri exp/$L/tri2a/decode_dev_tg_sri$mode # done +# Decode tri2b +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + graph_dir=exp/$L/tri2b/graph_${lm_suffix} + mkdir -p $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri2b \ + $graph_dir + + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ + exp/$L/tri2b/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/eval \ + exp/$L/tri2b/decode_eval_${lm_suffix} + ) & + done +done +wait; + +# Train tri3b, which is LDA+MLLT+SAT. 
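# tri3b adds speaker-adapted training (SAT) on top of the LDA+MLLT features:
# per-speaker fMLLR transforms are estimated during training, which is why the
# decoding below uses steps/decode_fmllr.sh and the later SGMM stages align
# with steps/align_fmllr.sh.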
+for L in $GP_LANGUAGES; do + ( + mkdir -p exp/$L/tri2b_ali + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + --use-graphs true data/$L/train data/$L/lang exp/$L/tri2b exp/$L/tri2b_ali \ + >& exp/$L/tri2b_ali/align.log + + num_states=$(grep "^$L" conf/tri.conf | cut -f2) + num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) + mkdir -p exp/$L/tri3b + steps/train_sat.sh --cmd "$train_cmd" \ + --cluster-thresh 100 $num_states $num_gauss data/$L/train data/$L/lang \ + exp/$L/tri2b_ali exp/$L/tri3b >& exp/$L/tri3b/train.log + ) & +done +wait; + +# Decode 3b +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + graph_dir=exp/$L/tri3b/graph_${lm_suffix} + mkdir -p $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri3b \ + $graph_dir + + mkdir -p exp/$L/tri3b/decode_dev_${lm_suffix} + steps/decode_fmllr.sh --nj 5 --cmd "$decode_cmd" \ + $graph_dir data/$L/dev exp/$L/tri3b/decode_dev_${lm_suffix} + steps/decode_fmllr.sh --nj 5 --cmd "$decode_cmd" \ + $graph_dir data/$L/eval exp/$L/tri3b/decode_eval_${lm_suffix} + ) & +done +done +wait; + +## Train sgmm2b, which is SGMM on top of LDA+MLLT+SAT features. +for L in $GP_LANGUAGES; do + ( + mkdir -p exp/$L/tri3b_ali + steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \ + data/$L/train data/$L/lang exp/$L/tri3b exp/$L/tri3b_ali + + num_states=$(grep "^$L" conf/sgmm.conf | cut -f2) + num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3) + mkdir -p exp/$L/ubm4a + steps/train_ubm.sh --cmd "$train_cmd" \ + 600 data/$L/train data/$L/lang exp/$L/tri3b_ali exp/$L/ubm4a + + mkdir -p exp/$L/sgmm2_4a + steps/train_sgmm2.sh --cmd "$train_cmd" \ + $num_states $num_substates data/$L/train data/$L/lang exp/$L/tri3b_ali \ + exp/$L/ubm4a/final.ubm exp/$L/sgmm2_4a + ) & +done +wait; + +## Decode sgmm2_4a +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + graph_dir=exp/$L/sgmm2_4a/graph_${lm_suffix} + mkdir -p $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/sgmm2_4a \ + $graph_dir + + steps/decode_sgmm2.sh --use-fmllr true --nj 5 --cmd "$decode_cmd" \ + --transform-dir exp/$L/tri3b/decode_dev_${lm_suffix} $graph_dir data/$L/dev \ + exp/$L/sgmm2_4a/decode_dev_${lm_suffix} + steps/decode_sgmm2.sh --use-fmllr true --nj 5 --cmd "$decode_cmd" \ + --transform-dir exp/$L/tri3b/decode_eval_${lm_suffix} $graph_dir data/$L/eval \ + exp/$L/sgmm2_4a/decode_eval_${lm_suffix} + ) + done +done +wait; + + +# Now we'll align the SGMM system to prepare for discriminative training MMI +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + mkdir -p exp/$L/sgmm2_4a_ali + steps/align_sgmm2.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir exp/$L/tri3b_ali --use-graphs true --use-gselect true data/$L/train \ + data/$L/lang exp/$L/sgmm2_4a exp/$L/sgmm2_4a_ali + + mkdir -p exp/$L/sgmm2_4a_denlats + steps/make_denlats_sgmm2.sh --nj 10 --sub-split 10 --cmd "$decode_cmd" \ + --transform-dir exp/$L/tri3b_ali data/$L/train data/$L/lang \ + exp/$L/sgmm2_4a_ali exp/$L/sgmm2_4a_denlats + mkdir -p exp/$L/sgmm2_4a_mmi_b0.1 + steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" \ + --transform-dir exp/$L/tri3b_ali --boost 0.1 data/$L/train data/$L/lang \ + exp/$L/sgmm2_4a_ali exp/$L/sgmm2_4a_denlats exp/$L/sgmm2_4a_mmi_b0.1 + ) & + done +done +wait; + +# decode sgmm2_4a-mmi_b0.1 +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + graph_dir=exp/$L/sgmm2_4a/graph_${lm_suffix} + for iter in 1 2 3 4; do + for test in dev eval; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" \ + --iter $iter --transform-dir 
exp/$L/tri3b/decode_${test}_${lm_suffix} data/$L/lang_test_${lm_suffix} \ + data/$L/${test} exp/$L/sgmm2_4a/decode_${test}_${lm_suffix} \ + exp/$L/sgmm2_4a_mmi_b0.1/decode_${test}_${lm_suffix}_it$iter + done + done + ) & + done +done +wait; + + # SGMMs starting from non-SAT triphone system, both with and without # speaker vectors. for L in $GP_LANGUAGES; do ( - mkdir -p exp/$L/tri1_ali - steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/$L/train data/$L/lang exp/$L/tri1 exp/$L/tri1_ali \ - >& exp/$L/tri1_ali/align.log - mkdir -p exp/$L/ubm2a - steps/train_ubm.sh --cmd "$train_cmd" \ - 400 data/$L/train data/$L/lang exp/$L/tri1_ali exp/$L/ubm2a \ - >& exp/$L/ubm2a/train.log || exit 1; + steps/train_ubm.sh --cmd "$train_cmd" \ + 400 data/$L/train data/$L/lang exp/$L/tri1_ali exp/$L/ubm2a \ + >& exp/$L/ubm2a/train.log num_states=$(grep "^$L" conf/sgmm.conf | cut -f2) num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3) @@ -222,35 +377,5 @@ for L in $GP_LANGUAGES; do done # loop over model with and without speaker vecs done # loop over languages -# Train SGMMs using SAT features -for L in $GP_LANGUAGES; do - ( - mkdir -p exp/$L/ubm2c - steps/train_ubm.sh --cmd "$train_cmd" \ - 400 data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr exp/$L/ubm2c \ - >& exp/$L/ubm2c/train.log || exit 1; - num_states=$(grep "^$L" conf/tri.conf | cut -f2) - num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) - mkdir -p exp/$L/sgmm2c - steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 \ - $num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr \ - exp/$L/ubm2c/final.ubm exp/$L/sgmm2c >& exp/$L/sgmm2c/train.log - ) & -done -wait - -for L in $GP_LANGUAGES; do - for lm_suffix in tgpr_sri; do - ( - graph_dir=exp/$L/sgmm2c/graph_${lm_suffix} - mkdir -p $graph_dir - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/sgmm2c $graph_dir - steps/decode_sgmm.sh --nj 5 --cmd "$decode_cmd" \ - --transform-dir exp/$L/tri2a/decode_dev_${lm_suffix} \ - $graph_dir data/$L/dev exp/$L/sgmm2c/decode_dev_${lm_suffix} - ) & - done -done diff --git a/egs/hkust/s5/RESULTS b/egs/hkust/s5/RESULTS index 851e2a1ced1..33d9f713e75 100644 --- a/egs/hkust/s5/RESULTS +++ b/egs/hkust/s5/RESULTS @@ -20,4 +20,8 @@ exp/lstm5e/decode/cer_10:%WER 37.61 [ 21121 / 56154, 1829 ins, 3941 del, 15351 s # nnet2 results exp/nnet2_5d/decode/cer_10:%WER 38.59 [ 21669 / 56154, 2498 ins, 3581 del, 15590 sub ] # ConvNet with 2 convolutional layers and 2 ReLU layers -exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ] +exp/nnet2_convnet/decode/cer_10:%WER 41.19 [ 23129 / 56154, 2599 ins, 3782 del, 16748 sub ] + +# nnet3 results (using speed perturbed data) +exp/nnet3/tdnn_sp/decode_dev/cer_10:%WER 33.79 [ 18977 / 56154, 2027 ins, 3485 del, 13465 sub ] +exp/nnet3/lstm_sp_ld5/decode_dev/cer_9:%WER 33.51 [ 18815 / 56154, 1813 ins, 3249 del, 13753 sub ] \ No newline at end of file diff --git a/egs/hkust/s5/cmd.sh b/egs/hkust/s5/cmd.sh index 2a46d89f385..71dd849a93b 100644 --- a/egs/hkust/s5/cmd.sh +++ b/egs/hkust/s5/cmd.sh @@ -1,13 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export train_cmd=run.pl -#export decode_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/hkust/s5/local/create_oov_char_lexicon.pl b/egs/hkust/s5/local/create_oov_char_lexicon.pl new file mode 100644 index 00000000000..aaf5d3bcb9b --- /dev/null +++ b/egs/hkust/s5/local/create_oov_char_lexicon.pl @@ -0,0 +1,46 @@ +#!/usr/bin/perl +# Copyright 2016 LeSpeech (Author: Xingyu Na) +# +# A script for char-based Chinese OOV lexicon generation. +# +# Input 1: char-based dictionary, example +# CHAR1 ph1 ph2 +# CHAR2 ph3 +# CHAR3 ph2 ph4 +# +# Input 2: OOV word list, example +# WORD1 +# WORD2 +# WORD3 +# +# where WORD1 is in the format of "CHAR1CHAR2". +# +# Output: OOV lexicon, in the format of normal lexicon + +if($#ARGV != 1) { + print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n\n"; + print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n"; + print STDERR "### oovwordlist: OOV word list\n"; + print STDERR "### oovlex: output OOV lexicon\n"; + exit; +} + +use encoding utf8; +my %prons; +open(DICT, $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); +foreach () { + chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; +} +close DICT; + +open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); +while () { + chomp; + print $_; + @A = split("", $_); + foreach (@A) { + print " $prons{$_}"; + } + print "\n"; +} +close WORDS; diff --git a/egs/hkust/s5/local/hkust_format_data.sh b/egs/hkust/s5/local/hkust_format_data.sh index 4f517e6dd1a..33cf8fa22ef 100755 --- a/egs/hkust/s5/local/hkust_format_data.sh +++ b/egs/hkust/s5/local/hkust_format_data.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -23,26 +23,13 @@ done rm -r data/lang_test cp -r data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -71,4 +58,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo hkust_format_data succeeded. - diff --git a/egs/hkust/s5/local/hkust_prepare_dict.sh b/egs/hkust/s5/local/hkust_prepare_dict.sh index cd3ed602c70..dc9e5262dfb 100755 --- a/egs/hkust/s5/local/hkust_prepare_dict.sh +++ b/egs/hkust/s5/local/hkust_prepare_dict.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright 2016 LeSpeech (Author: Xingyu Na) + # prepare dictionary for HKUST # it is done for English and Chinese separately, # For English, we use CMU dictionary, and Sequitur G2P @@ -14,23 +16,19 @@ train_dir=data/local/train dev_dir=data/local/dev dict_dir=data/local/dict mkdir -p $dict_dir - -case 0 in #goto here - 1) -;; #here: -esac - - +mkdir -p $dict_dir/lexicon-{en,ch} + # extract full vocabulary cat $train_dir/text $dev_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\ sed -e 's/ /\n/g' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\ - grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt + grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/words.txt # split into English and Chinese -cat $dict_dir/vocab-full.txt | grep '[a-zA-Z]' > $dict_dir/vocab-en.txt -cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt +cat $dict_dir/words.txt | grep '[a-zA-Z]' > $dict_dir/lexicon-en/words-en.txt +cat $dict_dir/words.txt | grep -v '[a-zA-Z]' > $dict_dir/lexicon-ch/words-ch.txt + -# produce pronunciations for english +##### produce pronunciations for english if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then echo "--- Downloading CMU dictionary ..." svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ @@ -40,19 +38,19 @@ fi echo "--- Striping stress and pronunciation variant markers from cmudict ..." perl $dict_dir/cmudict/scripts/make_baseform.pl \ $dict_dir/cmudict/cmudict.0.7a /dev/stdout |\ - sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $dict_dir/cmudict-plain.txt + sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $dict_dir/cmudict/cmudict-plain.txt echo "--- Searching for English OOV words ..." 
gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \ - $dict_dir/cmudict-plain.txt $dict_dir/vocab-en.txt |\ - egrep -v '<.?s>' > $dict_dir/vocab-en-oov.txt + $dict_dir/cmudict/cmudict-plain.txt $dict_dir/lexicon-en/words-en.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/words-en-oov.txt gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ - $dict_dir/vocab-en.txt $dict_dir/cmudict-plain.txt |\ - egrep -v '<.?s>' > $dict_dir/lexicon-en-iv.txt + $dict_dir/lexicon-en/words-en.txt $dict_dir/cmudict/cmudict-plain.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/lexicon-en-iv.txt -wc -l $dict_dir/vocab-en-oov.txt -wc -l $dict_dir/lexicon-en-iv.txt +wc -l $dict_dir/lexicon-en/words-en-oov.txt +wc -l $dict_dir/lexicon-en/lexicon-en-iv.txt pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` export PYTHONPATH=$PYTHONPATH:`pwd`/tools/g2p/lib/python${pyver}/site-packages @@ -83,21 +81,78 @@ fi echo "--- Preparing pronunciations for OOV words ..." python tools/g2p/lib/python${pyver}/site-packages/g2p.py \ - --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt + --model=conf/g2p_model --apply $dict_dir/lexicon-en/words-en-oov.txt > $dict_dir/lexicon-en/lexicon-en-oov.txt -cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ - sort > $dict_dir/lexicon-en-phn.txt +cat $dict_dir/lexicon-en/lexicon-en-oov.txt $dict_dir/lexicon-en/lexicon-en-iv.txt |\ + sort > $dict_dir/lexicon-en/lexicon-en-phn.txt +mkdir $dict_dir/map +cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/map/cmu +cat conf/pinyin2cmu | awk -v cmu=$dict_dir/map/cmu \ + 'BEGIN{while((getline $dict_dir/map/cmu-used +cat $dict_dir/map/cmu | awk -v cmu=$dict_dir/map/cmu-used \ + 'BEGIN{while((getline $dict_dir/map/cmu-not-used + +gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/map/cmu-not-used conf/cmu2pinyin |\ + egrep -v '<.?s>' > $dict_dir/map/cmu-py + +cat $dict_dir/map/cmu-py | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } + print "@entry"; + print "\n"; + } +' conf/pinyin2cmu > $dict_dir/map/cmu-cmu +cat $dict_dir/lexicon-en/lexicon-en-phn.txt | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { + if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } + else {push(@entry, $A[$i])}; + } + print "@entry"; + print "\n"; + } +' $dict_dir/map/cmu-cmu > $dict_dir/lexicon-en/lexicon-en.txt -# produce pronunciations for chinese -if [ ! -f $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt ]; then - wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz - gunzip $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt.gz +##### produce pronunciations for chinese +if [ ! 
-f $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt ]; then + mkdir -p $dict_dir/cedict + wget -P $dict_dir/cedict http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz + gunzip $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz fi -cat $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ +cat $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ perl -e ' while () { @A = split(" ", $_); @@ -109,27 +164,24 @@ cat $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1 } print "\n"; } - ' | sort -k1 > $dict_dir/ch-dict.txt + ' | sort -k1 > $dict_dir/cedict/ch-dict.txt echo "--- Searching for Chinese OOV words ..." gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \ - $dict_dir/ch-dict.txt $dict_dir/vocab-ch.txt |\ - egrep -v '<.?s>' > $dict_dir/vocab-ch-oov.txt + $dict_dir/cedict/ch-dict.txt $dict_dir/lexicon-ch/words-ch.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/words-ch-oov.txt gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ - $dict_dir/vocab-ch.txt $dict_dir/ch-dict.txt |\ - egrep -v '<.?s>' > $dict_dir/lexicon-ch-iv.txt - -wc -l $dict_dir/vocab-ch-oov.txt -wc -l $dict_dir/lexicon-ch-iv.txt + $dict_dir/lexicon-ch/words-ch.txt $dict_dir/cedict/ch-dict.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/lexicon-ch-iv.txt +wc -l $dict_dir/lexicon-ch/words-ch-oov.txt +wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt -# this -unset LC_ALL -# first make sure number of characters and pinyins -# are equal -cat $dict_dir/ch-dict.txt |\ +# validate Chinese dictionary and compose a char-based +# dictionary in order to get OOV pronunciations +cat $dict_dir/cedict/ch-dict.txt |\ perl -e ' use encoding utf8; while () { @@ -138,15 +190,38 @@ cat $dict_dir/ch-dict.txt |\ $proun_len = @A - 1 ; if ($word_len == $proun_len) {print $_;} } - ' > $dict_dir/ch-dict-1.txt + ' > $dict_dir/cedict/ch-dict-1.txt -cat $dict_dir/ch-dict-1.txt | awk '{print $1}' | sed -e 's/\(\S\)/\1\n/g' | grep -v '^$' > $dict_dir/ch-char.txt -cat $dict_dir/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/ch-char-pinyin.txt -wc -l $dict_dir/ch-char.txt -wc -l $dict_dir/ch-char-pinyin.txt -paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt +# extract chars +cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\ + perl -e ' + use encoding utf8; + while () { + @A = split(" ", $_); + @chars = split("", $A[0]); + foreach (@chars) { + print "$_\n"; + } + } + ' | grep -v '^$' > $dict_dir/lexicon-ch/ch-char.txt -cat $dict_dir/ch-char-dict.txt |\ +# extract individual pinyins +cat $dict_dir/cedict/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/lexicon-ch/ch-char-pinyin.txt + +# first make sure number of characters and pinyins +# are equal, so that a char-based dictionary can +# be composed. +nchars=`wc -l < $dict_dir/lexicon-ch/ch-char.txt` +npinyin=`wc -l < $dict_dir/lexicon-ch/ch-char-pinyin.txt` +if [ $nchars -ne $npinyin ]; then + echo "Found $nchars chars and $npinyin pinyin. Please check!" 
+ exit 1 +fi + +paste $dict_dir/lexicon-ch/ch-char.txt $dict_dir/lexicon-ch/ch-char-pinyin.txt | sort -u > $dict_dir/lexicon-ch/ch-char-dict.txt + +# create a multiple pronunciation dictionary +cat $dict_dir/lexicon-ch/ch-char-dict.txt |\ perl -e ' my $prev = ""; my $out_line = ""; @@ -161,14 +236,13 @@ cat $dict_dir/ch-char-dict.txt |\ $prev = $cur; } print $out_line; - ' > $dict_dir/ch-char-dict-1.txt - -cat $dict_dir/vocab-ch-oov.txt | awk -v w=$dict_dir/ch-char-dict-1.txt \ - 'BEGIN{while((getline0) dict[$1]=$2;} - {printf("%s", $1); for (i=1; i<=length($1); i++) { py=substr($1, i, 1); printf(" %s", dict[py]); } printf("\n"); }' \ - > $dict_dir/lexicon-ch-oov.txt + ' > $dict_dir/lexicon-ch/ch-char-dict-mp.txt +2 +# get lexicon for Chinese OOV words +perl local/create_oov_char_lexicon.pl $dict_dir/lexicon-ch/ch-char-dict-mp.txt $dict_dir/lexicon-ch/words-ch-oov.txt > $dict_dir/lexicon-ch/lexicon-ch-oov.txt -cat $dict_dir/lexicon-ch-oov.txt |\ +# seperate multiple prons for Chinese OOV lexicon +cat $dict_dir/lexicon-ch/lexicon-ch-oov.txt |\ perl -e ' my @entry; my @entry1; @@ -192,72 +266,18 @@ cat $dict_dir/lexicon-ch-oov.txt |\ print "\n"; } } - ' > $dict_dir/lexicon-ch-oov1.txt + ' > $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt -cat $dict_dir/lexicon-ch-oov1.txt $dict_dir/lexicon-ch-iv.txt |\ - awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch.txt +# compose IV and OOV lexicons for Chinese +cat $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt $dict_dir/lexicon-ch/lexicon-ch-iv.txt |\ + awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch/lexicon-ch.txt -cat $dict_dir/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ - utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch-cmu.txt +# convert Chinese pinyin to CMU format +cat $dict_dir/lexicon-ch/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ + utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch/lexicon-ch-cmu.txt -cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/cmu -cat conf/pinyin2cmu | awk -v cmu=$dict_dir/cmu \ - 'BEGIN{while((getline $dict_dir/cmu-used -cat $dict_dir/cmu | awk -v cmu=$dict_dir/cmu-used \ - 'BEGIN{while((getline $dict_dir/cmu-not-used - -gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ - $dict_dir/cmu-not-used conf/cmu2pinyin |\ - egrep -v '<.?s>' > $dict_dir/cmu-py - -cat $dict_dir/cmu-py | \ - perl -e ' - open(MAPS, $ARGV[0]) or die("could not open map file"); - my %py2ph; - foreach $line () { - @A = split(" ", $line); - $py = shift(@A); - $py2ph{$py} = [@A]; - } - my @entry; - while () { - @A = split(" ", $_); - @entry = (); - $W = shift(@A); - push(@entry, $W); - for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } - print "@entry"; - print "\n"; - } -' conf/pinyin2cmu > $dict_dir/cmu-cmu - -cat $dict_dir/lexicon-en-phn.txt | \ - perl -e ' - open(MAPS, $ARGV[0]) or die("could not open map file"); - my %py2ph; - foreach $line () { - @A = split(" ", $line); - $py = shift(@A); - $py2ph{$py} = [@A]; - } - my @entry; - while () { - @A = split(" ", $_); - @entry = (); - $W = shift(@A); - push(@entry, $W); - for($i = 0; $i < @A; $i++) { - if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } - else {push(@entry, $A[$i])}; - } - print "@entry"; - print "\n"; - } -' $dict_dir/cmu-cmu > $dict_dir/lexicon-en.txt - -cat $dict_dir/lexicon-en.txt $dict_dir/lexicon-ch-cmu.txt |\ +# combine English and Chinese lexicons +cat $dict_dir/lexicon-en/lexicon-en.txt $dict_dir/lexicon-ch/lexicon-ch-cmu.txt |\ sort -u > $dict_dir/lexicon1.txt cat 
$dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ @@ -284,21 +304,14 @@ echo SIL > $dict_dir/optional_silence.txt # No "extra questions" in the input to this setup, as we don't # have stress or tone -#echo -n > $dict_dir/extra_questions.txt cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ >> $dict_dir/extra_questions.txt || exit 1; - # Add to the lexicon the silences, noises etc. (echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU'; echo ' SPN' ) | \ cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1; - -export LC_ALL=C - - - exit 1; diff --git a/egs/hkust/s5/local/nnet/run_cnn.sh b/egs/hkust/s5/local/nnet/run_cnn.sh index 17fbc2d7c17..e0b7e10df86 100755 --- a/egs/hkust/s5/local/nnet/run_cnn.sh +++ b/egs/hkust/s5/local/nnet/run_cnn.sh @@ -82,7 +82,7 @@ if [ $stage -le 4 ]; then cnn_dbn=$dir/cnn_dbn.nnet { # Concatenate CNN layers and DBN, num_components=$(nnet-info $feature_transform | grep -m1 num-components | awk '{print $2;}') - nnet-concat "nnet-copy --remove-first-layers=$num_components $feature_transform_dbn - |" $dbn $cnn_dbn \ + nnet-concat "nnet-copy --remove-first-components=$num_components $feature_transform_dbn - |" $dbn $cnn_dbn \ 2>$dir/log/concat_cnn_dbn.log || exit 1 } # Train diff --git a/egs/hkust/s5/local/nnet/run_lstm.sh b/egs/hkust/s5/local/nnet/run_lstm.sh index 38c4474ac07..ec5d0e3a856 100755 --- a/egs/hkust/s5/local/nnet/run_lstm.sh +++ b/egs/hkust/s5/local/nnet/run_lstm.sh @@ -46,7 +46,7 @@ if [ $stage -le 1 ]; then $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --network-type lstm --learn-rate 0.0001 \ --cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \ - --train-opts "--momentum 0.9 --halving-factor 0.5" \ + --train-tool-opts "--momentum 0.9 --halving-factor 0.5" \ --delta-opts "--delta-order=2" \ --train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \ --proto-opts "--num-cells 2000 --num-recurrent 750 --num-layers 1 --clip-gradient 5.0" \ diff --git a/egs/hkust/s5/local/nnet2/run_convnet.sh b/egs/hkust/s5/local/nnet2/run_convnet.sh index f5baab0dc5d..56b81c42a11 100755 --- a/egs/hkust/s5/local/nnet2/run_convnet.sh +++ b/egs/hkust/s5/local/nnet2/run_convnet.sh @@ -49,7 +49,7 @@ fi --num-threads 1 --minibatch-size 512 \ --mix-up 20000 --samples-per-iter 300000 \ --num-epochs 15 --delta-order 2 \ - --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \ + --initial-effective-lrate 0.0001 --final-effective-lrate 0.00001 \ --num-jobs-initial 3 --num-jobs-final 8 --splice-width 5 \ --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \ --num-filters2 256 --patch-dim2 4 \ diff --git a/egs/hkust/s5/local/nnet3/run_ivector_common.sh b/egs/hkust/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..046f723ca1e --- /dev/null +++ b/egs/hkust/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,153 @@ +#!/bin/bash + +# this script contains some common (shared) parts of the run_nnet*.sh scripts. + +. cmd.sh + + +stage=0 +num_threads_ubm=32 +speed_perturb=true +use_sat_alignments=true + +set -e +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if [ "$use_sat_alignments" == "true" ] ; then + gmm_dir=exp/tri5a + align_script=steps/align_fmllr.sh +else + gmm_dir=exp/tri3a + align_script=steps/align_si.sh +fi + +if [ $stage -le 1 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/hkust-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in train dev; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + if [ "$datadir" == "train" ]; then + dir=data/train_hires + cat $dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $dir/wav.scp_scaled || exit 1; + mv $dir/wav.scp $dir/wav.scp_nonorm + mv $dir/wav.scp_scaled $dir/wav.scp + fi + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 2 ]; then + # Train a system just for its LDA+MLLT transform. We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. 
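# This LDA+MLLT system is not used for decoding; it only supplies the feature
# transform on which the diagonal UBM (stage 3) and the i-vector extractor
# (stage 4) below are trained.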
+ steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/train_hires data/lang \ + ${gmm_dir}_ali exp/nnet3/tri5 +fi + +if [ $stage -le 3 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \ + --num-frames 700000 \ + --num-threads $num_threads_ubm \ + data/train_hires 512 exp/nnet3/tri5 exp/nnet3/diag_ubm +fi + +if [ $stage -le 4 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 5 ] && [ "$speed_perturb" == "true" ]; then + # Although the nnet will be trained by high resolution data, + # we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + utils/perturb_data_dir_speed.sh 0.9 data/train data/temp1 + utils/perturb_data_dir_speed.sh 1.0 data/train data/temp2 + utils/perturb_data_dir_speed.sh 1.1 data/train data/temp3 + utils/combine_data.sh --extra-files utt2uniq data/train_sp data/temp1 data/temp2 data/temp3 + rm -r data/temp1 data/temp2 data/temp3 + + mfccdir=mfcc_perturbed + for x in train_sp; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 70 \ + data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + done + utils/fix_data_dir.sh data/train_sp + + $align_script --nj 30 --cmd "$train_cmd" \ + data/train_sp data/lang $gmm_dir ${gmm_dir}_sp_ali || exit 1 + + # Now perturb the high resolution daa + utils/copy_data_dir.sh data/train_sp data/train_sp_hires + mfccdir=mfcc_perturbed_hires + for x in train_sp_hires; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 70 --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_hires/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_hires/$x $mfccdir || exit 1; + done + utils/fix_data_dir.sh data/train_sp_hires +fi + +if [ "$speed_perturb" == "true" ]; then + train_set=train_sp +else + train_set=train +fi + +if [ $stage -le 6 ]; then + rm -f exp/nnet3/.error 2>/dev/null + ivectordir=exp/nnet3/ivectors_${train_set}_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/hkust-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage + fi + # We extract iVectors on all the train data, which will be what we train the + # system on. With --utts-per-spk-max 2, the script. pairs the utterances + # into twos, and treats each of these pairs as one speaker. Note that these + # are extracted 'online'. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_hires_max2 + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_hires_max2 \ + exp/nnet3/extractor \ + exp/nnet3/ivectors_${train_set}_hires \ + || touch exp/nnet3/.error + [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." 
&& exit 1; +fi + +if [ $stage -le 7 ]; then + rm -f exp/nnet3/.error 2>/dev/null + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ + data/dev_hires exp/nnet3/extractor exp/nnet3/ivectors_dev || touch exp/nnet3/.error & + wait + [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; +fi + +exit 0; diff --git a/egs/hkust/s5/local/nnet3/run_lstm.sh b/egs/hkust/s5/local/nnet3/run_lstm.sh new file mode 100755 index 00000000000..061040e55c1 --- /dev/null +++ b/egs/hkust/s5/local/nnet3/run_lstm.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# this is a basic lstm script + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. +set -e + +stage=0 +train_stage=-10 +use_sat_alignments=true +affix= +speed_perturb=true + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +clipping_threshold=10.0 +norm_based_clipping=true +common_egs_dir= + +# natural gradient options +ng_per_element_scale_options= +ng_affine_options= +num_epochs=4 + +# training options +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +num_jobs_initial=2 +num_jobs_final=12 +shrink=0.98 +momentum=0.5 +adaptive_shrink=true +num_chunk_per_minibatch=100 +num_bptt_steps=20 +samples_per_iter=20000 +remove_egs=true + +# feature options +use_ivectors=true + +#decode options +extra_left_context= +frames_per_chunk= + +# End configuration section. + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$dir/scoring/text.filt -unset LC_ALL #for character error rate cat $dir/scoring/text.filt | awk '{ print $1}' > $dir/scoring/utt_id -cat $dir/scoring/text.filt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' | sed -e 's/\(\S\)/\1 /g' > $dir/scoring/utt_tra +cat $dir/scoring/text.filt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' |\ + perl -e ' + use encoding utf8; + while () { + @words = split(" ", $_); + foreach (@words) { + @chars = split("", $_); + foreach (@chars) { + print "$_ "; + } + } + print "\n"; + } + ' > $dir/scoring/utt_tra paste $dir/scoring/utt_id $dir/scoring/utt_tra > $dir/scoring/char.filt for lmwt in `seq $min_lmwt $max_lmwt`; do cat $dir/scoring/$lmwt.txt | awk '{ print $1}' > $dir/scoring/utt_id - cat $dir/scoring/$lmwt.txt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' | sed -e 's/\(\S\)/\1 /g' > $dir/scoring/utt_tra + cat $dir/scoring/$lmwt.txt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' |\ + perl -e ' + use encoding utf8; + while () { + @words = split(" ", $_); + foreach (@words) { + @chars = split("", $_); + foreach (@chars) { + print "$_ "; + } + } + print "\n"; + } + ' > $dir/scoring/utt_tra paste $dir/scoring/utt_id $dir/scoring/utt_tra > $dir/scoring/${lmwt}.char done rm $dir/scoring/utt_tra $dir/scoring/utt_id -export LC_ALL=C - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ compute-wer --text --mode=present \ ark:$dir/scoring/text.filt ark:$dir/scoring/LMWT.txt ">&" $dir/wer_LMWT || exit 1; diff --git a/egs/hkust/s5/path.sh b/egs/hkust/s5/path.sh index e49bed09e8f..5adfbeec7c2 100755 --- a/egs/hkust/s5/path.sh +++ b/egs/hkust/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. #export KALDI_ROOT=/home/dpovey/kaldi-trunk-test -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/iban/README b/egs/iban/README new file mode 100644 index 00000000000..4385ed37201 --- /dev/null +++ b/egs/iban/README @@ -0,0 +1,84 @@ +### +# Iban Data collected by Sarah Samson Juan and Laurent Besacier +# Prepared by Sarah Samson Juan and Laurent Besacier +# Created in GETALP, Grenoble, France +### + + +## INTRODUCTION ## +This package has iban text and speech corpora used for Automatic Speech Recognition (ASR) experiments. Data is available in the subdirectories of /data. The subdirectories contain: +a. train - train transcript for training ASR system using Kaldi ASR (http://kaldi.sourceforge.net/) +b. test - test transcript for testing ASR system (also Kaldi ASR format) +c. wav - speech corpus + +We have provided text corpus and language model in the /LM directory, while, the pronunciation dictionary in /lang directory. + +###PUBLICATION ON IBAN DATA AND ASR ##### +Details on the corpora and the our experiments on iban ASR can be found in the following list of publication. We appreciate if you cite them if you intend to publish. 
+
+@inproceedings{Juan14,
+ Author = {Sarah Samson Juan and Laurent Besacier and Solange Rossato},
+ Booktitle = {Proceedings of Workshop for Spoken Language Technology for Under-resourced (SLTU)},
+ Month = {May},
+ Title = {Semi-supervised G2P bootstrapping and its application to ASR for a very under-resourced language: Iban},
+ Year = {2014}}
+
+
+@inproceedings{Juan2015,
+ Title = {Using Resources from a closely-Related language to develop ASR for a very under-resourced Language: A case study for Iban},
+ Author = {Sarah Samson Juan and Laurent Besacier and Benjamin Lecouteux and Mohamed Dyab},
+ Booktitle = {Proceedings of INTERSPEECH},
+ Year = {2015},
+ Address = {Dresden, Germany},
+ Month = {September}}
+
+
+###IBAN SPEECH CORPUS
+News data provided by a local radio station in Sarawak, Malaysia.
+
+Directory: data/train
+Files: text (training transcript), wav.scp (file id and path to audio file), utt2spk (file id and speaker id), spk2utt (speaker id and file ids), wav (.wav files).
+For more information about the format, please refer to the Kaldi website http://kaldi.sourceforge.net/data_prep.html (an illustrative example is also given at the end of this README).
+Description: training data in Kaldi format, about 7 hours. Note: The paths of the wav files in wav.scp MUST BE MODIFIED to point to the actual location.
+
+Directory: data/test
+Files: text (test transcript), wav.scp (file id and path to audio file), utt2spk (file id and speaker id), spk2utt (speaker id and file ids), wav (.wav files).
+Description: test data in Kaldi format, about 1 hour. Note: The paths of the wav files in wav.scp MUST BE MODIFIED to point to the actual location.
+
+The audio files are named as follows:
+ib[m|f]_SPK_UTT, where m refers to a male and f to a female speaker, SPK denotes the speaker id and UTT is the utterance id.
+
+#### IBAN TEXT CORPUS
+Directory: /LM/
+Files: iban-bp-2012.txt, iban-lm-o3.arpa
+
+# /iban-bp-2012.txt
+Contains 2M words of text data crawled from an online newspaper and cleaned as much as we could.
+
+# /iban-lm-o3.arpa
+The language model built with SRILM (http://www.speech.sri.com/projects/srilm/) from iban-bp-2012.txt.
+
+
+#### LEXICON/PRONUNCIATION DICTIONARY
+Directory: /lang
+Files : lexicon.txt (lexicon), nonsilence_phones.txt (speech phones), optional_silence.txt (silence phone)
+Description: the lexicon contains words and their respective pronunciations, plus non-speech sounds and noise, in Kaldi format. Details on the development of the dictionary can be found in our papers. (For this package, we provide the Iban-Hybrid version.)
+
+
+#TO DOWNLOAD THE REPOSITORY
+
+svn co https://github.com/sarahjuan/iban
+
+### SCRIPTS
+In /kaldi-scripts, you can find all the scripts that can be used to train and test models from the existing data and lang directories. Note: Paths need to be changed to make them work in your own directory.
+
+You can launch run.sh to prepare the data & language model, compute MFCCs and train acoustic models.
+
+
+### WER RESULTS OBTAINED USING OUR CORPORA AND SETTINGS. RESULTS OBTAINED AFTER UPDATING THE TEST TRANSCRIPT. THE ONES REPORTED IN OUR PAPERS WERE BEFORE THIS UPDATE ##
+
+See the latest results in the s5/RESULTS file (they will not match the results from the papers).
+
+##ACKNOWLEDGEMENT ###
+We would like to thank the Ministry of Higher Education Malaysia for providing financial support to conduct this study. We also thank The Borneo Post news agency for providing online materials for building the text corpus, and Radio Televisyen Malaysia (RTM), Sarawak, Malaysia, for providing the news data.
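+
+### EXAMPLE OF THE KALDI DATA FILES
+The entries below are purely illustrative (the utterance id, speaker id and path are made up); they only show how the files described above fit together. A training utterance ibf_002_017 spoken by speaker ibf_002 would appear as:
+  text:     ibf_002_017 ...transcription words...
+  wav.scp:  ibf_002_017 /path/to/iban/data/wav/ibf_002/ibf_002_017.wav
+  utt2spk:  ibf_002_017 ibf_002
+  spk2utt:  ibf_002 ibf_002_017 ibf_002_018 ...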
+ diff --git a/egs/iban/s5/RESULTS b/egs/iban/s5/RESULTS new file mode 100644 index 00000000000..09077fdcba8 --- /dev/null +++ b/egs/iban/s5/RESULTS @@ -0,0 +1,16 @@ +%WER 15.32 [ 1686 / 11006, 220 ins, 338 del, 1128 sub ] exp/sgmm2_5b2/decode_dev.big/wer_18_0.0 +%WER 15.36 [ 1691 / 11006, 214 ins, 322 del, 1155 sub ] exp/nnet3/nnet_tdnn_h_sp_4_850_170/decode_dev.big/wer_18_0.0 +%WER 15.50 [ 1706 / 11006, 212 ins, 327 del, 1167 sub ] exp/nnet3/nnet_tdnn_h_sp_4_850_170/decode_dev.rescored/wer_18_0.0 +%WER 15.84 [ 1743 / 11006, 242 ins, 332 del, 1169 sub ] exp/sgmm2_5b2/decode_dev.rescored/wer_15_0.0 +%WER 17.45 [ 1921 / 11006, 252 ins, 326 del, 1343 sub ] exp/nnet3/nnet_tdnn_h_sp_4_850_170/decode_dev/wer_15_0.0 +%WER 17.55 [ 1932 / 11006, 266 ins, 323 del, 1343 sub ] exp/sgmm2_5b2/decode_dev/wer_13_0.0 +%WER 19.08 [ 2100 / 11006, 245 ins, 503 del, 1352 sub ] exp/tri3b/decode_dev.rescored/wer_20_0.0 +%WER 20.92 [ 2302 / 11006, 263 ins, 518 del, 1521 sub ] exp/tri3b/decode_dev/wer_19_0.0 +%WER 24.19 [ 2662 / 11006, 243 ins, 900 del, 1519 sub ] exp/tri2b/decode_dev.rescored/wer_14_0.0 +%WER 25.26 [ 2780 / 11006, 294 ins, 736 del, 1750 sub ] exp/tri3b/decode_dev.si/wer_16_0.0 +%WER 26.44 [ 2910 / 11006, 292 ins, 832 del, 1786 sub ] exp/tri2b/decode_dev/wer_13_0.0 +%WER 30.99 [ 3411 / 11006, 245 ins, 1391 del, 1775 sub ] exp/tri1/decode_dev.rescored/wer_12_0.0 +%WER 33.31 [ 3666 / 11006, 260 ins, 1428 del, 1978 sub ] exp/tri1/decode_dev/wer_12_0.0 +%WER 33.81 [ 3721 / 11006, 241 ins, 1585 del, 1895 sub ] exp/tri2a/decode_dev.rescored/wer_11_0.0 +%WER 35.69 [ 3928 / 11006, 243 ins, 1750 del, 1935 sub ] exp/tri2a/decode_dev/wer_12_0.0 +%WER 39.41 [ 4338 / 11006, 190 ins, 1237 del, 2911 sub ] exp/mono/decode_dev/wer_11_0.0 diff --git a/egs/iban/s5/cmd.sh b/egs/iban/s5/cmd.sh new file mode 100755 index 00000000000..487a2244c04 --- /dev/null +++ b/egs/iban/s5/cmd.sh @@ -0,0 +1,5 @@ +export train_cmd="run.pl --max-jobs-run 32" +export decode_cmd="run.pl --max-jobs-run 32" + +#export train_cmd="queue.pl" +#export decode_cmd="queue.pl --mem 4G" diff --git a/egs/iban/s5/conf/decode.config b/egs/iban/s5/conf/decode.config new file mode 100644 index 00000000000..10b0eee900b --- /dev/null +++ b/egs/iban/s5/conf/decode.config @@ -0,0 +1,4 @@ +# Use wider-than-normal decoding beams for RM. +first_beam=16.0 +beam=20.0 +lattice_beam=10.0 diff --git a/egs/iban/s5/conf/decode_dnn.config b/egs/iban/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/iban/s5/conf/mfcc.conf b/egs/iban/s5/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/iban/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/iban/s5/conf/mfcc_hires.conf b/egs/iban/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/iban/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
diff --git a/egs/iban/s5/conf/online_cmvn.conf b/egs/iban/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/iban/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/iban/s5/local/arpa2G.sh b/egs/iban/s5/local/arpa2G.sh
new file mode 100755
index 00000000000..dddd7eb9097
--- /dev/null
+++ b/egs/iban/s5/local/arpa2G.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# Copyright 2013-2014 Johns Hopkins University (authors: Yenda Trmal, Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+#Simple utility script to convert the gzipped ARPA lm into a G.fst file
+
+
+oov_prob_file=
+unk_fraction=
+cleanup=true
+#end configuration section.
+
+
+
+echo $0 $@
+
+[ -f ./path.sh ] && . ./path.sh
+[ -f ./cmd.sh ] && . ./cmd.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [options] <arpa-lm-file> <lang-dir> <dest-dir>"
+  echo "Options: --oov-prob-file <file>  # e.g. data/local/oov2prob"
+  echo "         # with this option it will replace <unk> with OOVs in G.fst."
+  exit 1;
+fi
+
+set -e          #Exit on non-zero return code from any command
+set -o pipefail #Exit if any of the commands in the pipeline will
+                #return non-zero return code
+
+lmfile=$1
+langdir=$2
+destdir=$3
+
+mkdir $destdir 2>/dev/null || true
+
+
+if [ ! -z "$oov_prob_file" ]; then
+  if [ ! -s "$oov_prob_file" ]; then
+    echo "$0: oov-prob file $oov_prob_file is empty or does not exist"
+    exit 1;
+  fi
+  if [ -z "$unk_fraction" ]; then
+    echo "--oov-prob option requires --unk-fraction option";
+    exit 1;
+  fi
+
+  min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
+    while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
+      if ($order == 1) { @A = split;
+        if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
+  echo "Minimum prob in LM file is $min_prob"
+
+  echo "$0: creating LM file with unk words, using $oov_prob_file, in $destdir/lm_tmp.gz"
+  gunzip -c $lmfile | \
+    perl -e ' ($oov_prob_file,$min_prob,$unk_fraction) = @ARGV; $ceilinged=0;
+      $min_prob < 0.0 || die "Bad min_prob";         # this is a log-prob
+      $unk_fraction > 0.0 || die "Bad unk_fraction"; # this is a prob
+      open(F, "<$oov_prob_file") || die "opening oov file";
+      while (<F>) { push @OOVS, $_; }
+      $num_oovs = @OOVS;
+      while(<STDIN>) {
+        if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
+        else { print; } # print all lines unchanged except the one that says ngram 1=X.
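+        # When the \1-grams: section header is reached, insert one unigram per
+        # OOV word from the oov2prob file: each probability is scaled by
+        # unk_fraction, converted to log10, and capped at the lowest unigram
+        # log-prob found above, so no OOV becomes more likely than an existing word.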
+        if (m/^\\1-grams:$/) {
+          foreach $l (@OOVS) {
+            @A = split(" ", $l);
+            @A == 2 || die "bad line in oov2prob: $_;";
+            ($word, $prob) = @A;
+            $log10prob = (log($prob * $unk_fraction) / log(10.0));
+            if ($log10prob > $min_prob) { $log10prob = $min_prob; $ceilinged++;}
+            print "$log10prob $word\n";
+          }
+        }} print STDERR "Ceilinged $ceilinged unk-probs\n";' \
+    $oov_prob_file $min_prob $unk_fraction | gzip -c > $destdir/lm_tmp.gz
+  lmfile=$destdir/lm_tmp.gz
+fi
+
+if [[ $lmfile == *.bz2 ]] ; then
+  decompress="bunzip2 -c $lmfile"
+elif [[ $lmfile == *.gz ]] ; then
+  decompress="gunzip -c $lmfile"
+else
+  decompress="cat $lmfile"
+fi
+
+$decompress | \
+  grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
+  arpa2fst - | \
+  fstprint | \
+  utils/eps2disambig.pl | \
+  utils/s2eps.pl | \
+  fstcompile --isymbols=$langdir/words.txt \
+    --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
+  fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
+fstisstochastic $destdir/G.fst || true;
+
+if $cleanup; then
+  rm $destdir/lm_tmp.gz 2>/dev/null || true;
+fi
+
+exit 0
diff --git a/egs/iban/s5/local/nnet3/run_ivector_common.sh b/egs/iban/s5/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..0aa7a301dfe
--- /dev/null
+++ b/egs/iban/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+## Script was adapted from WSJ (online) and RM (some settings)
+
+. cmd.sh
+mfccdir=mfcc
+
+stage=1
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if [ $stage -le 1 ]; then
+  for datadir in train; do
+    utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1
+    utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2
+    utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
+    utils/validate_data_dir.sh --no-feats data/${datadir}_tmp
+    rm -r data/temp1 data/temp2
+
+    mfccdir=mfcc_perturbed
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj 17 \
+      data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
+    utils/fix_data_dir.sh data/${datadir}_tmp
+
+    utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
+    utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0
+    utils/fix_data_dir.sh data/${datadir}_sp
+    rm -r data/temp0 data/${datadir}_tmp
+  done
+fi
+
+mkdir -p exp/nnet3
+
+if [ $stage -le 2 ]; then
+  steps/align_fmllr.sh --nj 16 --cmd "$train_cmd" \
+    data/train_sp data/lang exp/tri3b exp/nnet3/tri3b_ali_sp || exit 1
+fi
+
+mfccdir=mfcc_hires
+if [ $stage -le 3 ]; then
+  utils/copy_data_dir.sh data/train_sp data/train_hires || exit 1
+  steps/make_mfcc.sh --nj 16 --mfcc-config conf/mfcc_hires.conf \
+    --cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
+  steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir || exit 1;
+
+  for datadir in dev; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires || exit 1
+    steps/make_mfcc.sh --nj 6 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  # Train a small system just for its LDA+MLLT transform. We use --num-iters 13
+  # because after we get the transform (12th iter is the last), any further
+  # training is pointless.
+  steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
+    --realign-iters "" --splice-opts "--left-context=3 --right-context=3" \
+    5000 10000 data/train_hires data/lang \
+    exp/nnet3/tri3b_ali_sp exp/nnet3/tri5b || exit 1
+fi
+
+if [ $stage -le 5 ]; then
+  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 16 --num-frames 200000 \
+    data/train_hires 256 exp/nnet3/tri5b exp/nnet3/diag_ubm || exit 1
+fi
+
+if [ $stage -le 6 ]; then
+  # even though $nj is just 10, each job uses multiple processes and threads.
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" \
+    --nj 10 --num-processes 1 --num-threads 2 --ivector-dim 50 \
+    data/train_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (iVector starts at zero).
+  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires \
+    data/train_hires_max2 || exit 1
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 16 \
+    data/train_hires_max2 exp/nnet3/extractor exp/nnet3/ivectors_train || exit 1
+fi
+
+if [ $stage -le 8 ]; then
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 6 \
+    data/dev_hires exp/nnet3/extractor exp/nnet3/ivectors_dev || exit 1
+fi
+
+exit 0;
diff --git a/egs/iban/s5/local/nnet3/run_tdnn.sh b/egs/iban/s5/local/nnet3/run_tdnn.sh
new file mode 100755
index 00000000000..ac0e2efa1d0
--- /dev/null
+++ b/egs/iban/s5/local/nnet3/run_tdnn.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Combined from WSJ + RM
+
+# this is the standard "tdnn" system, built in nnet3; it's what we use to
+# call multi-splice.
+
+. ./cmd.sh
+
+
+# At this script level we don't support not running on GPU, as it would be painfully slow.
+# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false,
+# --num-threads 16 and --minibatch-size 128.
+
+stage=1
+train_stage=-10
+dir=exp/nnet3/tdnn_1
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <&2 "The script $0 expects one parameter -- the location of the Iban corpus"
+  exit 1
+fi
+if [ ! -d "$corpus" ] ; then
+  echo >&2 "The directory $corpus does not exist"
+fi
+
+
+echo "Preparing train and test data"
+mkdir -p data data/local data/train data/dev
+
+for x in train dev; do
+  echo "Copy spk2utt, utt2spk, wav.scp, text for $x"
+  cp $corpus/data/$x/text data/$x/text || exit 1;
+  cp $corpus/data/$x/spk2utt data/$x/spk2utt || exit 1;
+  cp $corpus/data/$x/utt2spk data/$x/utt2spk || exit 1;
+
+  # the corpus wav.scp contains physical paths, so we just re-generate
+  # the file again from scratch instead of figuring out how to edit it
+  # (see the worked example at the bottom of this script).
+  for rec in $(awk '{print $1}' $corpus/data/$x/text) ; do
+    spk=${rec%_*}
+    filename=$corpus/data/wav/$spk/${rec}.wav
+    if [ ! -f "$filename" ] ; then
+      echo >&2 "The file $filename could not be found ($rec)"
+      exit 1
+    fi
+    # we might want to store physical paths as a general rule
+    filename=$(readlink -f $filename)
+    echo "$rec $filename"
+  done > data/$x/wav.scp
+
+  # fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp,
+  # duplicate entries and so on). Also, it regenerates the spk2utt from
+  # utt2spk
+  utils/fix_data_dir.sh data/$x
+done
+
+echo "Copying language model"
+if [ -f $corpus/LM/iban-lm-o3.arpa.tar.gz ] ; then
+  tar zxf $corpus/LM/iban-lm-o3.arpa.tar.gz -C data/local/
+fi
+
+echo "Data preparation completed."
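+
+# Worked example of the utterance-id -> wav-path mapping above (the ids below
+# are illustrative only):
+#   rec=ibf_002_017
+#   spk=${rec%_*}        # strips the trailing "_017", giving spk=ibf_002
+#   filename=$corpus/data/wav/ibf_002/ibf_002_017.wav
+#   => wav.scp line: ibf_002_017 /absolute/path/to/ibf_002_017.wav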
+
diff --git a/egs/iban/s5/local/prepare_dict.sh b/egs/iban/s5/local/prepare_dict.sh
new file mode 100755
index 00000000000..ebec12bc171
--- /dev/null
+++ b/egs/iban/s5/local/prepare_dict.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+
+corpus=$1
+if [ -z "$corpus" ] ; then
+  echo >&2 "The script $0 expects one parameter -- the location of the Iban corpus"
+  exit 1
+fi
+if [ ! -d "$corpus" ] ; then
+  echo >&2 "The directory $corpus does not exist"
+fi
+
+mkdir -p data/lang data/local/dict
+
+
+cp $corpus/lang/dict/lexicon.txt data/local/dict/lexicon.txt
+cat data/local/dict/lexicon.txt | \
+  perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \
+  sort -u | grep -v 'SIL' > data/local/dict/nonsilence_phones.txt
+
+
+touch data/local/dict/extra_questions.txt
+touch data/local/dict/optional_silence.txt
+
+echo "SIL" > data/local/dict/optional_silence.txt
+echo "SIL" > data/local/dict/silence_phones.txt
+echo "<UNK>" > data/local/dict/oov.txt
+
+echo "Dictionary preparation succeeded"
diff --git a/egs/iban/s5/local/prepare_lm.sh b/egs/iban/s5/local/prepare_lm.sh
new file mode 100755
index 00000000000..a19dc18f566
--- /dev/null
+++ b/egs/iban/s5/local/prepare_lm.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+
+set -e -o pipefail
+
+# To create G.fst from the ARPA language model
+. ./path.sh || die "path.sh expected";
+
+local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm
+
+nl -nrz -w10 corpus/LM/iban-bp-2012.txt | sort -R > data/local/external_text
+local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external
+
+# let's do ngram interpolation of the previous two LMs;
+# lm.gz is always a symlink to the model with the best perplexity, so we use that
+
+mkdir -p data/srilm_interp
+for w in 0.9 0.8 0.7 0.6 0.5; do
+  ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \
+    -lambda $w -write-lm data/srilm_interp/lm.${w}.gz
+  echo -n "data/srilm_interp/lm.${w}.gz "
+  ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s
+done | sort -k15,15g > data/srilm_interp/perplexities.txt
+
+# for basic decoding, let's use only a trigram LM
+[ -d data/lang_test/ ] && rm -rf data/lang_test
+cp -R data/lang data/lang_test
+lm=$(cat data/srilm/perplexities.txt | grep 3gram | head -n1 | awk '{print $1}')
+local/arpa2G.sh $lm data/lang_test data/lang_test
+
+# for decoding with the bigger LM, find which interpolation weight gave the biggest improvement
+[ -d data/lang_big ] && rm -rf data/lang_big
+cp -R data/lang data/lang_big
+lm=$(cat data/srilm_interp/perplexities.txt | head -n1 | awk '{print $1}')
+local/arpa2G.sh $lm data/lang_big data/lang_big
+
+# for the really big LM, we should only decode using the small LM
+# and rescore using the big (const-arpa) LM
+utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big
+exit 0;
diff --git a/egs/iban/s5/local/score.sh b/egs/iban/s5/local/score.sh
new file mode 120000
index 00000000000..0afefc3158c
--- /dev/null
+++ b/egs/iban/s5/local/score.sh
@@ -0,0 +1 @@
+../steps/score_kaldi.sh
\ No newline at end of file
diff --git a/egs/iban/s5/local/train_lms_srilm.sh b/egs/iban/s5/local/train_lms_srilm.sh
new file mode 100755
index 00000000000..9ed88842650
--- /dev/null
+++ b/egs/iban/s5/local/train_lms_srilm.sh
@@ -0,0 +1,230 @@
+#!/bin/bash
+export LC_ALL=C
+
+words_file=
+train_text=
+dev_text=
+oov_symbol="<UNK>"
+
+echo "$0 $@"
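+# Example invocation (this is how local/prepare_lm.sh in this recipe calls it):
+#   local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm
+# The two positional arguments are the data directory and the target directory
+# where the LMs and the perplexity report will be written.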
+
+[ -f path.sh ] && . ./path.sh
+. ./utils/parse_options.sh || exit 1
+
+echo "-------------------------------------"
+echo "Building an SRILM language model     "
+echo "-------------------------------------"
+
+if [ $# -ne 2 ] ; then
+  echo "Incorrect number of parameters."
+  echo "Script has to be called like this:"
+  echo "  $0 [switches] <datadir> <tgtdir>"
+  echo "For example: "
+  echo "  $0 data data/srilm"
+  echo "The allowed switches are: "
+  echo "  words_file=<words_file>   word list file -- data/lang/words.txt by default"
+  echo "  train_text=<train_text>   data/train/text is used when not specified"
+  echo "  dev_text=<dev_text>       last 10 % of the train text is used by default"
+  echo "  oov_symbol=<oov_symbol>   symbol to use for oov modeling -- <UNK> by default"
+  exit 1
+fi
+
+datadir=$1
+tgtdir=$2
+outlm=lm.gz
+
+
+##End of configuration
+loc=`which ngram-count`;
+if [ -z $loc ]; then
+  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir=`pwd`/../../../tools/srilm/bin/i686-m64
+  else
+    sdir=`pwd`/../../../tools/srilm/bin/i686
+  fi
+  if [ -f $sdir/ngram-count ]; then
+    echo Using SRILM tools from $sdir
+    export PATH=$PATH:$sdir
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir. See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
+  fi
+fi
+
+# Prepare the destination directory
+mkdir -p $tgtdir
+
+for f in $words_file $train_text $dev_text; do
+  [ ! -s $f ] && echo "No such file $f" && exit 1;
+done
+
+[ -z $words_file ] && words_file=$datadir/lang/words.txt
+if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then
+  nr=`cat $train_text | wc -l`
+  nr_dev=$(($nr / 10 ))
+  nr_train=$(( $nr - $nr_dev ))
+  orig_train_text=$train_text
+  head -n $nr_train $train_text > $tgtdir/train_text
+  tail -n $nr_dev $train_text > $tgtdir/dev_text
+
+  train_text=$tgtdir/train_text
+  dev_text=$tgtdir/dev_text
+  echo "Using words file: $words_file"
+  echo "Using train text: 9/10 of $orig_train_text"
+  echo "Using dev text : 1/10 of $orig_train_text"
+elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then
+  echo "Using words file: $words_file"
+  echo "Using train text: $train_text"
+  echo "Using dev text : $dev_text"
+  train_text=$train_text
+  dev_text=$dev_text
+else
+  train_text=$datadir/train/text
+  dev_text=$datadir/dev2h/text
+  echo "Using words file: $words_file"
+  echo "Using train text: $train_text"
+  echo "Using dev text : $dev_text"
+fi
+
+
+
+# Extract the word list from the training dictionary; exclude special symbols
+sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '<eps>' | grep -v -F "$oov_symbol" > $tgtdir/vocab
+if (($?)); then
+  echo "Failed to create vocab from $words_file"
+  exit 1
+else
+  # wc vocab # doesn't work due to some encoding issues
+  echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'`
+fi
+
+# Kaldi transcript files contain Utterance_ID as the first word; remove it
+cat $train_text | cut -f2- -d' ' > $tgtdir/train.txt
+if (($?)); then
+  echo "Failed to create $tgtdir/train.txt from $train_text"
+  exit 1
+else
+  echo "Removed first word (uid) from every line of $train_text"
+  # wc text.train train.txt # doesn't work due to some encoding issues
+  echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
+  echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'`
+fi
+
+# Kaldi transcript files contain Utterance_ID as the first word; remove it
+cat $dev_text | cut -f2- -d' ' > $tgtdir/dev.txt
+if (($?)); then
+  echo "Failed to create $tgtdir/dev.txt from $dev_text"
+  exit 1
+else
+  echo "Removed first word (uid) from every line of $dev_text"
+  # wc text.train train.txt # doesn't work due to some encoding issues
+  echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
+  echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'`
+fi
+
+echo "-------------------"
+echo "Good-Turing 2grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/2gram.gt01.gz -gt1min 0 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/2gram.gt02.gz -gt1min 0 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 2grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/2gram.kn01.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/2gram.kn02.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Good-Turing 3grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text
$tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text 
$tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+if [ ! -z ${LIBLBFGS} ]; then
+  # please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault
+  # instead of that, we simply output the model in the maxent format and convert it to ARPA using the "ngram" tool
+  echo "-------------------"
+  echo "Maxent 2grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 2 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/2gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 3grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 4grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1
+
+fi
+
+
+echo "--------------------"
+echo "Computing perplexity"
+echo "--------------------"
+(
+  for f in $tgtdir/2gram* ; do ( echo $f; ngram -order 2 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+  for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+  for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt
+
+echo "The perplexity report is stored in $tgtdir/perplexities.txt"
+
+# This would link the lowest-perplexity LM as the output LM:
+# ln -sf $tgtdir/`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` $outlm
+
+# A slight modification of the previous approach:
+# we look at the two lowest-perplexity LMs and prefer the 3gram LM if it is one of
+# the two, even if the 4gram LM has lower perplexity.
+nof_trigram_lm=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | wc -l`
+if [[ $nof_trigram_lm -eq 0 ]] ; then
+  lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '`
+elif [[ $nof_trigram_lm -eq 2 ]] ; then
+  lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '`
+else # exactly one 3gram LM
+  lmfilename=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | cut -f 1 -d ' '`
+fi
+(cd $tgtdir; ln -sf `basename $lmfilename` $outlm )
+
diff --git a/egs/iban/s5/path.sh b/egs/iban/s5/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/iban/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/iban/s5/run.sh b/egs/iban/s5/run.sh new file mode 100755 index 00000000000..b184a79e45e --- /dev/null +++ b/egs/iban/s5/run.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +# Copyright 2015 Sarah Samson Juan +# Apache 2.0 + +# This script prepares data and train/decode ASR. +# Download the Iban corpus from github. wav files are in data/wav, +# language model in LM/*.arpa.tar.gz and lexicon in lang/dict. + +stage=0 + +# initialization PATH +. ./path.sh || die "path.sh expected"; +# initialization commands +. ./cmd.sh +. ./utils/parse_options.sh + +set -e -o pipefail +corpus=./corpus +# download iban to build ASR +if [ ! -f "$corpus/README" ]; then + #available from github + mkdir -p ./$corpus/ + [ ! -f ./iban.tar.gz ] && wget http://www.openslr.org/resources/24/iban.tar.gz + tar xzf iban.tar.gz -C $corpus +fi + +nj=16 +dev_nj=6 + +if [ $stage -le 1 ]; then + echo "Preparing data and training language models" + local/prepare_data.sh $corpus/ + local/prepare_dict.sh $corpus/ + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + local/prepare_lm.sh +fi + + +if [ $stage -le 2 ]; then + # Feature extraction + for x in train dev; do + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/$x exp/make_mfcc/$x mfcc + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc + done +fi + +if [ $stage -le 3 ]; then + ### Monophone + echo "Starting monophone training." + utils/subset_data_dir.sh data/train 1000 data/train.1k + steps/train_mono.sh --nj $nj --cmd "$train_cmd" data/train.1k data/lang exp/mono + echo "Mono training done." + + ( + echo "Decoding the dev set using monophone models." + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --config conf/decode.config --nj $dev_nj --cmd "$decode_cmd" \ + exp/mono/graph data/dev exp/mono/decode_dev + echo "Monophone decoding done." + ) & +fi + + +if [ $stage -le 4 ]; then + ### Triphone + echo "Starting triphone training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 3200 30000 data/train data/lang exp/mono_ali exp/tri1 + echo "Triphone training done." + + ( + echo "Decoding the dev set using triphone models." + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri1/graph data/dev exp/tri1/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri1/decode_dev exp/tri1/decode_dev.rescored + echo "Triphone decoding done." + ) & +fi + +if [ $stage -le 5 ]; then + ## Triphones + delta delta + # Training + echo "Starting (larger) triphone training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ + data/train data/lang exp/tri1 exp/tri1_ali + steps/train_deltas.sh --cmd "$train_cmd" \ + 4200 40000 data/train data/lang exp/tri1_ali exp/tri2a + echo "Triphone (large) training done." + + ( + echo "Decoding the dev set using triphone(large) models." + utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri2a/graph data/dev exp/tri2a/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri2a/decode_dev exp/tri2a/decode_dev.rescored + echo "Triphone(large) decoding done." 
+ ) & +fi + +if [ $stage -le 6 ]; then + ### Triphone + LDA and MLLT + # Training + echo "Starting LDA+MLLT training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2a exp/tri2a_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 4200 40000 data/train data/lang exp/tri2a_ali exp/tri2b + echo "LDA+MLLT training done." + + ( + echo "Decoding the dev set using LDA+MLLT models." + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri2b/graph data/dev exp/tri2b/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri2b/decode_dev exp/tri2b/decode_dev.rescored + echo "LDA+MLLT decoding done." + ) & +fi + + +if [ $stage -le 7 ]; then + ### Triphone + LDA and MLLT + SAT and FMLLR + # Training + echo "Starting SAT+FMLLR training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train data/lang exp/tri2b_ali exp/tri3b + echo "SAT+FMLLR training done." + + ( + echo "Decoding the dev set using SAT+FMLLR models." + utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph + steps/decode_fmllr.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri3b/graph data/dev exp/tri3b/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri3b/decode_dev exp/tri3b/decode_dev.rescored + echo "SAT+FMLLR decoding done." + ) & +fi + + +if [ $stage -le 8 ]; then + echo "Starting SGMM training." + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali + + steps/train_ubm.sh --cmd "$train_cmd" \ + 600 data/train data/lang exp/tri3b_ali exp/ubm5b2 + + steps/train_sgmm2.sh --cmd "$train_cmd" \ + 5200 12000 data/train data/lang exp/tri3b_ali exp/ubm5b2/final.ubm exp/sgmm2_5b2 + echo "SGMM training done." + + ( + echo "Decoding the dev set using SGMM models" + # Graph compilation + utils/mkgraph.sh data/lang_test exp/sgmm2_5b2 exp/sgmm2_5b2/graph + utils/mkgraph.sh data/lang_big/ exp/sgmm2_5b2 exp/sgmm2_5b2/graph_big + + steps/decode_sgmm2.sh --nj $dev_nj --cmd "$decode_cmd" \ + --transform-dir exp/tri3b/decode_dev \ + exp/sgmm2_5b2/graph data/dev exp/sgmm2_5b2/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/sgmm2_5b2/decode_dev exp/sgmm2_5b2/decode_dev.rescored + + steps/decode_sgmm2.sh --nj $dev_nj --cmd "$decode_cmd" \ + --transform-dir exp/tri3b/decode_dev \ + exp/sgmm2_5b2/graph_big data/dev exp/sgmm2_5b2/decode_dev.big + echo "SGMM decoding done." 
+ ) & +fi + +wait; +#score +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/iban/s5/steps b/egs/iban/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iban/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iban/s5/utils b/egs/iban/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iban/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/librispeech/s5/RESULTS b/egs/librispeech/s5/RESULTS index 885b8bcd9f3..ca3806bd168 100644 --- a/egs/librispeech/s5/RESULTS +++ b/egs/librispeech/s5/RESULTS @@ -164,37 +164,67 @@ %WER 17.16 [ 8982 / 52343, 855 ins, 1421 del, 6706 sub ] exp/nnet2_online/nnet_a_online/decode_test_other_tgmed/wer_12 %WER 18.90 [ 9891 / 52343, 798 ins, 1786 del, 7307 sub ] exp/nnet2_online/nnet_a_online/decode_test_other_tgsmall/wer_13 -# RNNLM rescoring of tri6b - -%WER 7.50 [ 4080 / 54402, 617 ins, 416 del, 3047 sub ] exp/tri6b/decode_tglarge_dev_clean/wer_14 -%WER 7.09 [ 3859 / 54402, 611 ins, 354 del, 2894 sub ] exp/tri6b/decode_tglarge_dev_clean_rnnlm_h150_me5-1000_L0.25/wer_14 -%WER 7.29 [ 3968 / 54402, 661 ins, 332 del, 2975 sub ] exp/tri6b/decode_tglarge_dev_clean_rnnlm_h150_me5-1000_L0.5/wer_13 -%WER 7.73 [ 4205 / 54402, 709 ins, 349 del, 3147 sub ] exp/tri6b/decode_tglarge_dev_clean_rnnlm_h150_me5-1000_L0.75/wer_12 - -%WER 21.94 [ 11180 / 50948, 1264 ins, 1506 del, 8410 sub ] exp/tri6b/decode_tglarge_dev_other/wer_16 -%WER 21.36 [ 10881 / 50948, 1309 ins, 1362 del, 8210 sub ] exp/tri6b/decode_tglarge_dev_other_rnnlm_h150_me5-1000_L0.25/wer_16 -%WER 21.29 [ 10848 / 50948, 1330 ins, 1324 del, 8194 sub ] exp/tri6b/decode_tglarge_dev_other_rnnlm_h150_me5-1000_L0.5/wer_16 -%WER 21.75 [ 11082 / 50948, 1351 ins, 1346 del, 8385 sub ] exp/tri6b/decode_tglarge_dev_other_rnnlm_h150_me5-1000_L0.75/wer_17 - -%WER 9.39 [ 5106 / 54402, 597 ins, 648 del, 3861 sub ] exp/tri6b/decode_tgmed_dev_clean/wer_14 -%WER 8.09 [ 4400 / 54402, 564 ins, 517 del, 3319 sub ] exp/tri6b/decode_tgmed_dev_clean_rnnlm_h150_me5-1000_L0.25/wer_15 -%WER 8.00 [ 4350 / 54402, 609 ins, 472 del, 3269 sub ] exp/tri6b/decode_tgmed_dev_clean_rnnlm_h150_me5-1000_L0.5/wer_15 -%WER 8.21 [ 4467 / 54402, 692 ins, 415 del, 3360 sub ] exp/tri6b/decode_tgmed_dev_clean_rnnlm_h150_me5-1000_L0.75/wer_12 - -%WER 25.16 [ 12816 / 50948, 1175 ins, 2076 del, 9565 sub ] exp/tri6b/decode_tgmed_dev_other/wer_16 -%WER 23.28 [ 11861 / 50948, 1289 ins, 1546 del, 9026 sub ] exp/tri6b/decode_tgmed_dev_other_rnnlm_h150_me5-1000_L0.25/wer_14 -%WER 23.03 [ 11732 / 50948, 1341 ins, 1467 del, 8924 sub ] exp/tri6b/decode_tgmed_dev_other_rnnlm_h150_me5-1000_L0.5/wer_14 -%WER 23.12 [ 11779 / 50948, 1351 ins, 1476 del, 8952 sub ] exp/tri6b/decode_tgmed_dev_other_rnnlm_h150_me5-1000_L0.75/wer_15 - -%WER 10.66 [ 5800 / 54402, 558 ins, 854 del, 4388 sub ] exp/tri6b/decode_tgsmall_dev_clean/wer_15 -%WER 8.78 [ 4779 / 54402, 586 ins, 588 del, 3605 sub ] exp/tri6b/decode_tgsmall_dev_clean_rnnlm_h150_me5-1000_L0.25/wer_14 -%WER 8.50 [ 4624 / 54402, 661 ins, 505 del, 3458 sub ] exp/tri6b/decode_tgsmall_dev_clean_rnnlm_h150_me5-1000_L0.5/wer_13 -%WER 8.56 [ 4659 / 54402, 674 ins, 485 del, 3500 sub ] exp/tri6b/decode_tgsmall_dev_clean_rnnlm_h150_me5-1000_L0.75/wer_13 - -%WER 27.18 [ 13850 / 50948, 1192 ins, 2340 del, 10318 sub ] exp/tri6b/decode_tgsmall_dev_other/wer_15 -%WER 24.72 [ 12596 / 50948, 1291 ins, 1803 del, 9502 sub ] 
exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.25/wer_14 -%WER 24.18 [ 12317 / 50948, 1284 ins, 1732 del, 9301 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.5/wer_15 -%WER 24.19 [ 12323 / 50948, 1327 ins, 1686 del, 9310 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.75/wer_15 +# RNNLM rescoring of tri6b (faster-rnnlm hidden=150 direct=4.0Gb, Hierarchical Softmax) +%WER 7.39 [ 4023 / 54402, 540 ins, 444 del, 3039 sub ] exp/tri6b/decode_tglarge_dev_clean/wer_13_1.0 +%WER 7.03 [ 3823 / 54402, 608 ins, 343 del, 2872 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me5-1000_L0.25/wer_13_0.5 +%WER 7.03 [ 3827 / 54402, 606 ins, 320 del, 2901 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me5-1000_L0.5/wer_14_0.5 +%WER 7.25 [ 3946 / 54402, 564 ins, 368 del, 3014 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me5-1000_L0.75/wer_14_1.0 + +%WER 21.31 [ 10858 / 50948, 1525 ins, 1151 del, 8182 sub ] exp/tri6b/decode_tglarge_dev_other/wer_17_0.0 +%WER 20.62 [ 10504 / 50948, 1377 ins, 1180 del, 7947 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me5-1000_L0.25/wer_15_0.5 +%WER 20.64 [ 10515 / 50948, 1253 ins, 1313 del, 7949 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me5-1000_L0.5/wer_16_1.0 +%WER 20.91 [ 10652 / 50948, 1344 ins, 1233 del, 8075 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me5-1000_L0.75/wer_15_1.0 + +%WER 9.21 [ 5012 / 54402, 703 ins, 510 del, 3799 sub ] exp/tri6b/decode_tgmed_dev_clean/wer_14_0.0 +%WER 7.99 [ 4345 / 54402, 554 ins, 487 del, 3304 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me5-1000_L0.25/wer_15_0.5 +%WER 7.68 [ 4177 / 54402, 596 ins, 414 del, 3167 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me5-1000_L0.5/wer_14_0.5 +%WER 7.70 [ 4190 / 54402, 582 ins, 422 del, 3186 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me5-1000_L0.75/wer_13_1.0 + +%WER 24.27 [ 12365 / 50948, 1365 ins, 1591 del, 9409 sub ] exp/tri6b/decode_tgmed_dev_other/wer_17_0.0 +%WER 22.51 [ 11468 / 50948, 1496 ins, 1235 del, 8737 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me5-1000_L0.25/wer_15_0.0 +%WER 22.11 [ 11267 / 50948, 1494 ins, 1163 del, 8610 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me5-1000_L0.5/wer_16_0.0 +%WER 22.10 [ 11262 / 50948, 1532 ins, 1131 del, 8599 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me5-1000_L0.75/wer_16_0.0 + +%WER 10.50 [ 5711 / 54402, 693 ins, 674 del, 4344 sub ] exp/tri6b/decode_tgsmall_dev_clean/wer_15_0.0 +%WER 8.53 [ 4641 / 54402, 582 ins, 555 del, 3504 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me5-1000_L0.25/wer_14_0.5 +%WER 8.09 [ 4400 / 54402, 605 ins, 469 del, 3326 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me5-1000_L0.5/wer_14_0.5 +%WER 8.02 [ 4363 / 54402, 594 ins, 460 del, 3309 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me5-1000_L0.75/wer_13_1.0 + +%WER 26.22 [ 13358 / 50948, 1330 ins, 1955 del, 10073 sub ] exp/tri6b/decode_tgsmall_dev_other/wer_17_0.0 +%WER 23.95 [ 12202 / 50948, 1523 ins, 1381 del, 9298 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me5-1000_L0.25/wer_14_0.0 +%WER 23.22 [ 11828 / 50948, 1553 ins, 1247 del, 9028 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me5-1000_L0.5/wer_14_0.0 +%WER 23.22 [ 11832 / 50948, 1435 ins, 1376 del, 9021 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me5-1000_L0.75/wer_15_0.5 + +# RNNLM rescoring of tri6b (faster-rnnlm hidden=150 
direct=1.6Gb Noise contrastive Estimation) +%WER 7.39 [ 4023 / 54402, 540 ins, 444 del, 3039 sub ] exp/tri6b/decode_tglarge_dev_clean/wer_13_1.0 +%WER 7.05 [ 3835 / 54402, 487 ins, 447 del, 2901 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_15_1.0 +%WER 6.84 [ 3723 / 54402, 524 ins, 394 del, 2805 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_13_1.0 +%WER 6.92 [ 3766 / 54402, 564 ins, 376 del, 2826 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_12_1.0 + +%WER 21.31 [ 10858 / 50948, 1525 ins, 1151 del, 8182 sub ] exp/tri6b/decode_tglarge_dev_other/wer_17_0.0 +%WER 20.90 [ 10648 / 50948, 1404 ins, 1227 del, 8017 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_15_0.5 +%WER 20.70 [ 10544 / 50948, 1271 ins, 1364 del, 7909 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_15_1.0 +%WER 20.82 [ 10605 / 50948, 1295 ins, 1347 del, 7963 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_15_1.0 + +%WER 9.21 [ 5012 / 54402, 703 ins, 510 del, 3799 sub ] exp/tri6b/decode_tgmed_dev_clean/wer_14_0.0 +%WER 8.01 [ 4360 / 54402, 669 ins, 402 del, 3289 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_14_0.0 +%WER 7.46 [ 4056 / 54402, 584 ins, 422 del, 3050 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_14_0.5 +%WER 7.28 [ 3962 / 54402, 536 ins, 451 del, 2975 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_14_1.0 + +%WER 24.27 [ 12365 / 50948, 1365 ins, 1591 del, 9409 sub ] exp/tri6b/decode_tgmed_dev_other/wer_17_0.0 +%WER 22.82 [ 11628 / 50948, 1530 ins, 1244 del, 8854 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_15_0.0 +%WER 22.21 [ 11315 / 50948, 1554 ins, 1152 del, 8609 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_15_0.0 +%WER 22.01 [ 11213 / 50948, 1609 ins, 1086 del, 8518 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_15_0.0 + +%WER 10.50 [ 5711 / 54402, 693 ins, 674 del, 4344 sub ] exp/tri6b/decode_tgsmall_dev_clean/wer_15_0.0 +%WER 8.56 [ 4659 / 54402, 677 ins, 467 del, 3515 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_14_0.0 +%WER 7.81 [ 4250 / 54402, 657 ins, 387 del, 3206 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_14_0.0 +%WER 7.58 [ 4125 / 54402, 618 ins, 406 del, 3101 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_13_0.5 + +%WER 26.22 [ 13358 / 50948, 1330 ins, 1955 del, 10073 sub ] exp/tri6b/decode_tgsmall_dev_other/wer_17_0.0 +%WER 24.07 [ 12264 / 50948, 1482 ins, 1435 del, 9347 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_15_0.0 +%WER 23.15 [ 11797 / 50948, 1526 ins, 1276 del, 8995 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_15_0.0 +%WER 22.92 [ 11677 / 50948, 1544 ins, 1241 del, 8892 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_16_0.0 ## Multi-splice version of online recipe. 
# for x in exp/nnet2_online/nnet_ms_a/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done @@ -225,6 +255,74 @@ %WER 18.23 [ 9288 / 50948, 782 ins, 1585 del, 6921 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt/wer_15 %WER 17.54 [ 8936 / 50948, 813 ins, 1425 del, 6698 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt_offline/wer_14 +## Multi-splice version of online recipe (5/16/2016). +# for x in exp/nnet2_online/nnet_ms_a/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 4.46 [ 2429 / 54402, 311 ins, 284 del, 1834 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_clean_fglarge/wer_13_1.0 +%WER 4.64 [ 2522 / 54402, 362 ins, 251 del, 1909 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_clean_tglarge/wer_12_0.5 +%WER 5.86 [ 3187 / 54402, 400 ins, 357 del, 2430 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_clean_tgmed/wer_13_0.0 +%WER 6.60 [ 3592 / 54402, 450 ins, 403 del, 2739 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_clean_tgsmall/wer_12_0.0 +%WER 12.31 [ 6274 / 50948, 742 ins, 784 del, 4748 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_other_fglarge/wer_16_0.5 +%WER 12.87 [ 6557 / 50948, 774 ins, 850 del, 4933 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_other_tglarge/wer_15_0.5 +%WER 15.25 [ 7770 / 50948, 871 ins, 1074 del, 5825 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_other_tgmed/wer_16_0.0 +%WER 16.55 [ 8434 / 50948, 832 ins, 1280 del, 6322 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_other_tgsmall/wer_16_0.0 +%WER 4.99 [ 2624 / 52576, 388 ins, 256 del, 1980 sub ] exp/nnet2_online/nnet_ms_a/decode_test_clean_fglarge/wer_13_0.5 +%WER 5.15 [ 2709 / 52576, 386 ins, 284 del, 2039 sub ] exp/nnet2_online/nnet_ms_a/decode_test_clean_tglarge/wer_13_0.5 +%WER 6.25 [ 3285 / 52576, 422 ins, 357 del, 2506 sub ] exp/nnet2_online/nnet_ms_a/decode_test_clean_tgmed/wer_13_0.0 +%WER 7.07 [ 3717 / 52576, 455 ins, 456 del, 2806 sub ] exp/nnet2_online/nnet_ms_a/decode_test_clean_tgsmall/wer_13_0.0 +%WER 12.89 [ 6748 / 52343, 878 ins, 769 del, 5101 sub ] exp/nnet2_online/nnet_ms_a/decode_test_other_fglarge/wer_16_0.0 +%WER 13.32 [ 6972 / 52343, 940 ins, 770 del, 5262 sub ] exp/nnet2_online/nnet_ms_a/decode_test_other_tglarge/wer_14_0.0 +%WER 15.82 [ 8281 / 52343, 886 ins, 1197 del, 6198 sub ] exp/nnet2_online/nnet_ms_a/decode_test_other_tgmed/wer_15_0.0 +%WER 17.09 [ 8948 / 52343, 863 ins, 1383 del, 6702 sub ] exp/nnet2_online/nnet_ms_a/decode_test_other_tgsmall/wer_15_0.0 + +# for x in exp/nnet2_online/nnet_ms_a_online/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 4.53 [ 2466 / 54402, 318 ins, 295 del, 1853 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_fglarge/wer_14_1.0 +%WER 4.76 [ 2592 / 54402, 338 ins, 286 del, 1968 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_fglarge_utt/wer_13_1.0 +%WER 4.57 [ 2488 / 54402, 330 ins, 285 del, 1873 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_fglarge_utt_offline/wer_13_1.0 +%WER 4.71 [ 2562 / 54402, 392 ins, 236 del, 1934 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge/wer_14_0.0 +%WER 4.90 [ 2665 / 54402, 352 ins, 280 del, 2033 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt/wer_14_0.5 +%WER 4.72 [ 2570 / 54402, 357 ins, 273 del, 1940 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt_offline/wer_14_0.5 +%WER 5.87 [ 3196 / 54402, 419 ins, 340 del, 2437 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed/wer_12_0.0 +%WER 6.11 [ 3326 / 54402, 385 ins, 396 del, 2545 sub ] 
exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt/wer_12_0.5 +%WER 5.99 [ 3258 / 54402, 382 ins, 392 del, 2484 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt_offline/wer_12_0.5 +%WER 6.58 [ 3581 / 54402, 472 ins, 379 del, 2730 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall/wer_11_0.0 +%WER 6.89 [ 3746 / 54402, 475 ins, 405 del, 2866 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt/wer_12_0.0 +%WER 6.69 [ 3637 / 54402, 480 ins, 383 del, 2774 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt_offline/wer_11_0.0 +%WER 12.67 [ 6456 / 50948, 774 ins, 771 del, 4911 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_fglarge/wer_16_0.5 +%WER 13.73 [ 6993 / 50948, 785 ins, 922 del, 5286 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_fglarge_utt/wer_14_1.0 +%WER 12.97 [ 6609 / 50948, 797 ins, 801 del, 5011 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_fglarge_utt_offline/wer_16_0.5 +%WER 13.09 [ 6670 / 50948, 800 ins, 826 del, 5044 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge/wer_15_0.5 +%WER 14.27 [ 7270 / 50948, 909 ins, 869 del, 5492 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt/wer_14_0.5 +%WER 13.46 [ 6859 / 50948, 828 ins, 845 del, 5186 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt_offline/wer_15_0.5 +%WER 15.27 [ 7782 / 50948, 874 ins, 1051 del, 5857 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed/wer_16_0.0 +%WER 16.41 [ 8359 / 50948, 949 ins, 1135 del, 6275 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt/wer_16_0.0 +%WER 15.56 [ 7926 / 50948, 893 ins, 1051 del, 5982 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt_offline/wer_16_0.0 +%WER 16.49 [ 8402 / 50948, 855 ins, 1210 del, 6337 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall/wer_15_0.0 +%WER 17.80 [ 9068 / 50948, 969 ins, 1260 del, 6839 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt/wer_15_0.0 +%WER 16.97 [ 8647 / 50948, 845 ins, 1324 del, 6478 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt_offline/wer_17_0.0 +%WER 5.05 [ 2654 / 52576, 411 ins, 239 del, 2004 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_fglarge/wer_12_0.5 +%WER 5.24 [ 2755 / 52576, 365 ins, 312 del, 2078 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_fglarge_utt/wer_13_1.0 +%WER 5.09 [ 2676 / 52576, 405 ins, 241 del, 2030 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_fglarge_utt_offline/wer_13_0.5 +%WER 5.22 [ 2744 / 52576, 393 ins, 282 del, 2069 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tglarge/wer_13_0.5 +%WER 5.38 [ 2826 / 52576, 413 ins, 284 del, 2129 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tglarge_utt/wer_13_0.5 +%WER 5.24 [ 2757 / 52576, 453 ins, 229 del, 2075 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tglarge_utt_offline/wer_13_0.0 +%WER 6.26 [ 3289 / 52576, 436 ins, 345 del, 2508 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgmed/wer_13_0.0 +%WER 6.54 [ 3441 / 52576, 435 ins, 381 del, 2625 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgmed_utt/wer_14_0.0 +%WER 6.28 [ 3303 / 52576, 426 ins, 359 del, 2518 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgmed_utt_offline/wer_14_0.0 +%WER 7.06 [ 3711 / 52576, 446 ins, 474 del, 2791 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgsmall/wer_14_0.0 +%WER 7.31 [ 3845 / 52576, 510 ins, 426 del, 2909 
sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgsmall_utt/wer_12_0.0 +%WER 7.08 [ 3723 / 52576, 460 ins, 445 del, 2818 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgsmall_utt_offline/wer_13_0.0 +%WER 13.17 [ 6891 / 52343, 936 ins, 713 del, 5242 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_fglarge/wer_14_0.0 +%WER 14.20 [ 7432 / 52343, 832 ins, 983 del, 5617 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_fglarge_utt/wer_15_0.5 +%WER 13.26 [ 6939 / 52343, 837 ins, 860 del, 5242 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_fglarge_utt_offline/wer_14_0.5 +%WER 13.53 [ 7080 / 52343, 952 ins, 779 del, 5349 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tglarge/wer_14_0.0 +%WER 14.77 [ 7730 / 52343, 877 ins, 1056 del, 5797 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tglarge_utt/wer_15_0.5 +%WER 13.74 [ 7192 / 52343, 871 ins, 920 del, 5401 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tglarge_utt_offline/wer_14_0.5 +%WER 15.78 [ 8259 / 52343, 898 ins, 1170 del, 6191 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgmed/wer_15_0.0 +%WER 16.97 [ 8884 / 52343, 939 ins, 1304 del, 6641 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgmed_utt/wer_16_0.0 +%WER 16.01 [ 8380 / 52343, 877 ins, 1210 del, 6293 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgmed_utt_offline/wer_16_0.0 +%WER 16.98 [ 8889 / 52343, 900 ins, 1283 del, 6706 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgsmall/wer_14_0.0 +%WER 18.21 [ 9533 / 52343, 966 ins, 1398 del, 7169 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgsmall_utt/wer_14_0.0 +%WER 17.29 [ 9050 / 52343, 894 ins, 1391 del, 6765 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgsmall_utt_offline/wer_15_0.0 ## Note: this learning rate is the effective learning rate; it gets multiplied by the num-jobs. 
# for x in exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch*{clean,other}*; do grep WER $x/wer_* | utils/best_wer.sh ; done @@ -323,3 +421,45 @@ %WER 13.79 [ 7219 / 52343, 847 ins, 953 del, 5419 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tglarge_utt_offline/wer_13 %WER 16.08 [ 8416 / 52343, 746 ins, 1466 del, 6204 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tgmed_utt_offline/wer_15 %WER 17.64 [ 9231 / 52343, 764 ins, 1662 del, 6805 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tgsmall_utt_offline/wer_14 + +# Results with nnet3 tdnn +# local/nnet3/run_tdnn.sh +# (4 epoch training on speed-perturbed data) +# num_params=19.3M +%WER 4.43 [ 2410 / 54402, 306 ins, 278 del, 1826 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 +%WER 4.63 [ 2520 / 54402, 369 ins, 259 del, 1892 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_tglarge/wer_12_0.5 +%WER 5.90 [ 3211 / 54402, 430 ins, 337 del, 2444 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_tgmed/wer_12_0.0 +%WER 6.66 [ 3622 / 54402, 450 ins, 415 del, 2757 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_tgsmall/wer_12_0.0 +%WER 11.62 [ 5922 / 50948, 727 ins, 741 del, 4454 sub ] exp/nnet3/tdnn_sp/decode_dev_other_fglarge/wer_14_0.5 +%WER 12.19 [ 6209 / 50948, 863 ins, 682 del, 4664 sub ] exp/nnet3/tdnn_sp/decode_dev_other_tglarge/wer_14_0.0 +%WER 14.52 [ 7396 / 50948, 789 ins, 1079 del, 5528 sub ] exp/nnet3/tdnn_sp/decode_dev_other_tgmed/wer_16_0.0 +%WER 15.83 [ 8063 / 50948, 867 ins, 1141 del, 6055 sub ] exp/nnet3/tdnn_sp/decode_dev_other_tgsmall/wer_14_0.0 +%WER 4.97 [ 2614 / 52576, 373 ins, 271 del, 1970 sub ] exp/nnet3/tdnn_sp/decode_test_clean_fglarge/wer_14_0.5 +%WER 5.15 [ 2708 / 52576, 446 ins, 235 del, 2027 sub ] exp/nnet3/tdnn_sp/decode_test_clean_tglarge/wer_13_0.0 +%WER 6.24 [ 3281 / 52576, 467 ins, 336 del, 2478 sub ] exp/nnet3/tdnn_sp/decode_test_clean_tgmed/wer_12_0.0 +%WER 6.95 [ 3654 / 52576, 459 ins, 433 del, 2762 sub ] exp/nnet3/tdnn_sp/decode_test_clean_tgsmall/wer_13_0.0 +%WER 12.14 [ 6352 / 52343, 883 ins, 649 del, 4820 sub ] exp/nnet3/tdnn_sp/decode_test_other_fglarge/wer_13_0.0 +%WER 12.62 [ 6605 / 52343, 898 ins, 720 del, 4987 sub ] exp/nnet3/tdnn_sp/decode_test_other_tglarge/wer_13_0.0 +%WER 15.10 [ 7904 / 52343, 874 ins, 1070 del, 5960 sub ] exp/nnet3/tdnn_sp/decode_test_other_tgmed/wer_13_0.0 +%WER 16.29 [ 8528 / 52343, 828 ins, 1320 del, 6380 sub ] exp/nnet3/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 + +# Results with nnet3 tdnn+chain model +# local/chain/run_tdnn_6z.sh +# (4 epoch training on speed-perturbed data) +# num_params=16.8M (12.7M after excluding the xent branch) +%WER 3.92 [ 2131 / 54402, 290 ins, 197 del, 1644 sub ] exp/chain/tdnn_6z_sp/decode_dev_clean_fglarge/wer_11_0.5 +%WER 4.09 [ 2227 / 54402, 337 ins, 176 del, 1714 sub ] exp/chain/tdnn_6z_sp/decode_dev_clean_tglarge/wer_11_0.0 +%WER 5.11 [ 2781 / 54402, 329 ins, 300 del, 2152 sub ] exp/chain/tdnn_6z_sp/decode_dev_clean_tgmed/wer_12_0.0 +%WER 5.83 [ 3172 / 54402, 335 ins, 372 del, 2465 sub ] exp/chain/tdnn_6z_sp/decode_dev_clean_tgsmall/wer_12_0.0 +%WER 10.43 [ 5314 / 50948, 528 ins, 697 del, 4089 sub ] exp/chain/tdnn_6z_sp/decode_dev_other_fglarge/wer_14_0.5 +%WER 10.95 [ 5581 / 50948, 546 ins, 764 del, 4271 sub ] exp/chain/tdnn_6z_sp/decode_dev_other_tglarge/wer_14_0.5 +%WER 13.20 [ 6723 / 50948, 676 ins, 858 del, 5189 sub ] exp/chain/tdnn_6z_sp/decode_dev_other_tgmed/wer_13_0.0 +%WER 14.56 [ 7419 / 50948, 715 ins, 1003 del, 5701 sub ] exp/chain/tdnn_6z_sp/decode_dev_other_tgsmall/wer_13_0.0 +%WER 4.28 [ 
2251 / 52576, 292 ins, 238 del, 1721 sub ] exp/chain/tdnn_6z_sp/decode_test_clean_fglarge/wer_11_1.0 +%WER 4.47 [ 2349 / 52576, 342 ins, 225 del, 1782 sub ] exp/chain/tdnn_6z_sp/decode_test_clean_tglarge/wer_11_0.5 +%WER 5.55 [ 2917 / 52576, 366 ins, 314 del, 2237 sub ] exp/chain/tdnn_6z_sp/decode_test_clean_tgmed/wer_13_0.0 +%WER 6.20 [ 3259 / 52576, 383 ins, 381 del, 2495 sub ] exp/chain/tdnn_6z_sp/decode_test_clean_tgsmall/wer_12_0.0 +%WER 10.76 [ 5634 / 52343, 643 ins, 672 del, 4319 sub ] exp/chain/tdnn_6z_sp/decode_test_other_fglarge/wer_12_0.5 +%WER 11.20 [ 5864 / 52343, 619 ins, 781 del, 4464 sub ] exp/chain/tdnn_6z_sp/decode_test_other_tglarge/wer_13_0.5 +%WER 13.47 [ 7051 / 52343, 733 ins, 933 del, 5385 sub ] exp/chain/tdnn_6z_sp/decode_test_other_tgmed/wer_13_0.0 +%WER 14.73 [ 7710 / 52343, 662 ins, 1209 del, 5839 sub ] exp/chain/tdnn_6z_sp/decode_test_other_tgsmall/wer_14_0.0 diff --git a/egs/librispeech/s5/cmd.sh b/egs/librispeech/s5/cmd.sh index 6395d96ca36..71dd849a93b 100644 --- a/egs/librispeech/s5/cmd.sh +++ b/egs/librispeech/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
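+# For example, to run everything locally on a single machine with run.pl you
+# could instead set:
+#   export train_cmd=run.pl
+#   export decode_cmd=run.pl
+#   export mkgraph_cmd=run.pl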
+ +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/librispeech/s5/local/chain/run_chain_common.sh b/egs/librispeech/s5/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..ab8b065ddd3 --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_chain_common.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes +set -e + +# configs for 'chain' +stage=0 +# chain options +frames_per_eg=150 +max_wer= + +# output directory names +dir= +treedir= +lang= +min_seg_len= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $treedir ] && echo "Set --treedir, this specifies the directory to store new tree " && exit 1; +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $dir ] && echo "Set --dir, this specifies the experiment directory to store files relevant to the experiment " && exit 1; + +# The iVector-extraction and feature-dumping parts are the same as the standard +# nnet3 setup, and you can skip them by setting "--stage 10" if you have already +# run those things. + +local/nnet3/run_ivector_common.sh --stage $stage \ + --speed-perturb true \ + --generate-alignments false || exit 1; + + +# Set the variables. These are based on variables set by run_ivector_common.sh +gmm_dir=exp/tri6b +train_set=train_960_sp +latgen_train_set=train_960_sp +ali_dir=exp/tri6b_sp +lat_dir=exp/tri6b_lats_sp + +################################### + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
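+  # Note: --frame-subsampling-factor 3 below means the 'chain' model emits one
+  # output per 3 input frames (about every 30 ms rather than every 10 ms); the
+  # 1-state-per-phone topology generated above is what makes this reduced
+  # output frame rate workable.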
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 6000 data/$latgen_train_set $lang $ali_dir $treedir +fi + +# combining the segments in training data to have a minimum length of frames_per_eg + tolerance +# this is critical stage in AMI (gives 1% absolute improvement) +if [ -z $min_seg_len ]; then + min_seg_len=$(python -c "print ($frames_per_eg+5)/100.0") +fi + +if [ $stage -le 12 ]; then + rm -rf data/${train_set}_min${min_seg_len}_hires + steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \ + --input-data-dir data/${train_set}_hires \ + --output-data-dir data/${train_set}_min${min_seg_len}_hires + + #extract ivectors for the new data + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ + data/${train_set}_min${min_seg_len}_hires data/${train_set}_min${min_seg_len}_hires_max2 + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_min${min_seg_len}_hires_max2 \ + exp/nnet3/extractor \ + exp/nnet3/ivectors_${train_set}_min${min_seg_len} || exit 1; + + # combine the non-hires features for alignments/lattices + rm -rf data/${latgen_train_set}_min${min_seg_len} + steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \ + --input-data-dir data/${latgen_train_set} \ + --output-data-dir data/${latgen_train_set}_min${min_seg_len} +fi + +train_set=${train_set}_min${min_seg_len} +latgen_train_set=${latgen_train_set}_min${min_seg_len} +ivector_dir=exp/nnet3/ivectors_${train_set} +ali_dir=${ali_dir}_min${min_seg_len} +lat_dir=${lat_dir}_min${min_seg_len} +if [ $stage -le 13 ]; then + # realigning data as the segments would have changed + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" data/$latgen_train_set data/lang $gmm_dir $ali_dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$latgen_train_set \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +mkdir -p $dir +train_data_dir=data/${train_set}_hires +if [ ! -z $max_wer ]; then + if [ $stage -le 15 ]; then + bad_utts_dir=${gmm_dir}_${train_set}_bad_utts + if [ ! -f $bad_utts_dir/all_info.sorted.txt ]; then + # This stage takes a lot of time ~7hrs, so run only if file is not available already + steps/cleanup/find_bad_utts.sh --cmd "$decode_cmd" --nj 405 data/$latgen_train_set data/lang $ali_dir $bad_utts_dir + fi + python local/sort_bad_utts.py --bad-utt-info-file $bad_utts_dir/all_info.sorted.txt --max-wer $max_wer --output-file $dir/wer_sorted_utts_${max_wer}wer + utils/copy_data_dir.sh --validate-opts "--no-wav" data/${train_set}_hires data/${train_set}_${max_wer}wer_hires + utils/filter_scp.pl $dir/wer_sorted_utts_${max_wer}wer data/${train_set}_hires/feats.scp > data/${train_set}_${max_wer}wer_hires/feats.scp + utils/fix_data_dir.sh data/${train_set}_${max_wer}wer_hires + fi + train_data_dir=data/${train_set}_${max_wer}wer_hires + # we don't realign again as the segment ids don't change +fi + +cat > $dir/vars < from the graph + fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $dir/graph_test_tgsmall/HCLG.fst $dir/graph_test_tgsmall/HCLG.fst +fi + +graph_dir=$dir/graph_test_tgsmall +if [ $stage -le 19 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || touch $dir/.error + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || touch $dir/.error + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || touch $dir/.error + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || touch $dir/.error + ) & + done +fi +wait; +exit 0; diff --git a/egs/librispeech/s5/local/chain/run_tdnn_6z_discriminative.sh b/egs/librispeech/s5/local/chain/run_tdnn_6z_discriminative.sh new file mode 100755 index 00000000000..944cfe255da --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_tdnn_6z_discriminative.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +set -o pipefail +set -e +# this is run_discriminative.sh + +# This script does discriminative training on top of chain nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. +# + + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=-10 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +srcdir=exp/chain/tdnn_6z_sp +. $srcdir/vars +# sets the directory names where features, ivectors and lattices are stored +#train_data_dir +#train_ivector_dir +#lat_dir + +online_ivector_dir=$train_ivector_dir +degs_dir= # If provided, will skip the degs directory creation +lats_dir= # If provided, will skip denlats creation + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion} + +## Egs options +frames_per_eg=150 +frames_overlap_per_eg=30 +truncate_deriv_weights=10 + +## Nnet training options +effective_learning_rate=0.00000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=4 +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +minibatch_size=64 +modify_learning_rates=true +last_layer_factor=0.1 + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! 
cuda-compiled; then + cat </dev/null || true + + data_dirs= + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ + $x $train_data_dir exp/shift_hires mfcc_hires + utils/fix_data_dir.sh ${train_data_dir}_fs$x + data_dirs="$data_dirs ${train_data_dir}_fs$x" + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp + done + utils/combine_data.sh ${train_data_dir}_fs $data_dirs + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + rm -r ${train_data_dir}_fs$x + done + fi + + train_data_dir=${train_data_dir}_fs + + affix=_fs +fi + +rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp +done +online_ivector_dir=${online_ivector_dir}_fs + +if [ $stage -le 1 ]; then + # hardcode no-GPU for alignment, although you could use GPU [you wouldn't + # get excellent GPU utilization though.] + nj=350 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ + --online-ivector-dir $online_ivector_dir \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; +fi + +if [ -z "$lats_dir" ]; then + lats_dir=${srcdir}_denlats${affix} + if [ $stage -le 2 ]; then + nj=50 + # this doesn't really affect anything strongly, except the num-jobs for one of + # the phases of get_egs_discriminative.sh below. + num_threads_denlats=6 + subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving + # total slots = 80 * 6 = 480. + steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ + --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --online-ivector-dir $online_ivector_dir \ + --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ + $train_data_dir $lang $srcdir ${lats_dir} ; + fi +fi + +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] + +valid_left_context=$[valid_left_context + frames_per_eg] +valid_right_context=$[valid_right_context + frames_per_eg] + +cmvn_opts=`cat $srcdir/cmvn_opts` + +if [ -z "$degs_dir" ]; then + degs_dir=${srcdir}_degs${affix} + + if [ $stage -le 3 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d ${srcdir}_degs/storage ]; then + utils/create_split_dir.pl \ + /export/b{01,02,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage + fi + # have a higher maximum num-jobs if + if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi + + degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" + + steps/nnet3/get_egs_discriminative.sh \ + --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ + --adjust-priors false --acwt 1.0 \ + --online-ivector-dir $online_ivector_dir \ + --left-context $left_context --right-context $right_context \ + --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ + --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; + fi +fi + +if [ $stage -le 4 ]; then + steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ + --stage $train_stage \ + --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ + --criterion $criterion --drop-frames true --acoustic-scale 1.0 \ + --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ + --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ + --regularization-opts "$regularization_opts" --use-frame-shift false \ + --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + ${degs_dir} $dir ; +fi + +graph_dir=$srcdir/graph_tgsmall +if [ $stage -le 5 ]; then + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in test_clean test_other dev_clean dev_other; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x.adj + + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_tgsmall_$iter || touch $dir/.error; + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}_{tgsmall,tgmed}_$iter || touch $dir/.error + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}_{tgsmall,tglarge}_$iter || touch $dir/.error + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}_{tgsmall,fglarge}_$iter || touch $dir/.error + ) & + done + done +fi +wait; + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. 
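+  # e.g.:
+  #   local/chain/run_tdnn_6z_discriminative.sh --cleanup true --stage 6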
+ rm ${lats_dir}/lat.*.gz || true + rm ${srcdir}_ali/ali.*.gz || true + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + + +exit 0; + diff --git a/egs/librispeech/s5/local/data_prep.sh b/egs/librispeech/s5/local/data_prep.sh index a46e2de4f04..5a264a07464 100755 --- a/egs/librispeech/s5/local/data_prep.sh +++ b/egs/librispeech/s5/local/data_prep.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2014 Vassil Panayotov +# Copyright 2014 Vassil Panayotov # 2014 Johns Hopkins University (author: Daniel Povey) # Apache 2.0 @@ -31,6 +31,7 @@ wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp trans=$dst/text; [[ -f "$trans" ]] && rm $trans utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender +utt2dur=$dst/utt2dur; [[ -f "$utt2dur" ]] && rm $utt2dur for reader_dir in $(find $src -mindepth 1 -maxdepth 1 -type d | sort); do reader=$(basename $reader_dir) @@ -78,6 +79,8 @@ nutt2spk=$(wc -l <$utt2spk) ! [ "$ntrans" -eq "$nutt2spk" ] && \ echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; +utils/data/get_utt2dur.sh $dst 1>&2 || exit 1 + utils/validate_data_dir.sh --no-feats $dst || exit 1; echo "$0: successfully prepared data in $dst" diff --git a/egs/librispeech/s5/local/decode_example.sh b/egs/librispeech/s5/local/decode_example.sh index 11a0670f240..815bf17b9f7 100755 --- a/egs/librispeech/s5/local/decode_example.sh +++ b/egs/librispeech/s5/local/decode_example.sh @@ -34,22 +34,10 @@ mfccdir=mfcc # here. lang=data/lang lang_test=data/lang_test -lang_test_tmp=data/local/lang_test_tmp/ -mkdir -p $lang_test_tmp mkdir -p $lang_test cp -r $lang/* $lang_test -gunzip -c $lm | utils/find_arpa_oovs.pl $lang_test/words.txt \ - > $lang_test_tmp/oovs.txt || exit 1 -gunzip -c $lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $lang_test_tmp/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | \ - fstcompile --isymbols=$lang_test/words.txt --osymbols=$lang_test/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $lang_test/G.fst +gunzip -c $lm | arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang_test/words.txt - $lang_test/G.fst utils/validate_lang.pl --skip-determinization-check $lang_test || exit 1; # Compiles decoding graph. diff --git a/egs/librispeech/s5/local/format_data.sh b/egs/librispeech/s5/local/format_data.sh index 52159f5e500..64914bde42d 100755 --- a/egs/librispeech/s5/local/format_data.sh +++ b/egs/librispeech/s5/local/format_data.sh @@ -18,40 +18,23 @@ fi lm_dir=$1 -tmpdir=data/local/lm_tmp lexicon=data/local/lang_tmp/lexiconp.txt -mkdir -p $tmpdir # This loop was taken verbatim from wsj_format_data.sh, and I'm leaving it in place in # case we decide to add more language models at some point for lm_suffix in tgpr; do test=data/lang_test_${lm_suffix} mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones oov.txt oov.int; do + for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones topo oov.txt oov.int; do cp -r data/lang/$f $test done - gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1 - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. 
Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; done echo "Succeeded in formatting data." -rm -r $tmpdir exit 0 diff --git a/egs/librispeech/s5/local/format_lms.sh b/egs/librispeech/s5/local/format_lms.sh index d83029b0e1f..b530f61d2d9 100755 --- a/egs/librispeech/s5/local/format_lms.sh +++ b/egs/librispeech/s5/local/format_lms.sh @@ -49,24 +49,9 @@ for lm_suffix in tgsmall tgmed; do test=${src_dir}_test_${lm_suffix} mkdir -p $test cp -r ${src_dir}/* $test - gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1 - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other - # similar things in a LM from Geoff. Removing all "illegal" combinations of - # and , which are supposed to occur only at being/end of utt. These - # can cause determinization failures of CLG [ends up being epsilon cycles]. gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl --skip-determinization-check $test || exit 1; done diff --git a/egs/librispeech/s5/local/nnet3/run_ivector_common.sh b/egs/librispeech/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..a82e26fefe7 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# this script contains some common (shared) parts of the run_nnet*.sh scripts. + +. cmd.sh + + +stage=0 +generate_alignments=true # false if doing ctc training +speed_perturb=true + +set -e +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + + +train_set=train_960 +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + #Although the nnet will be trained on high-resolution data, we still have to perturb the normal data to get the alignment + # _sp stands for speed-perturbed + + for datadir in train_960; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 + utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${datadir}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + utils/fix_data_dir.sh data/${datadir}_sp + rm -r data/temp0 data/${datadir}_tmp + done + fi + + if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/train_960_sp data/lang exp/tri6b exp/tri6b_sp || exit 1 + fi + train_set=train_960_sp +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in $train_set test_clean test_other dev_clean dev_other; do + if [ "$datadir" == "$train_set" ]; then + utils/data/perturb_data_dir_volume.sh data/$datadir + fi + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done + + # now create some data subsets. + # mixed is the clean+other data. + # 30k is 1/10 of the data (around 100 hours), 60k is 1/5th of it (around 200 hours). + utils/subset_data_dir.sh data/${train_set}_hires 30000 data/${train_set}_mixed_hires_30k + utils/subset_data_dir.sh data/${train_set}_hires 60000 data/${train_set}_mixed_hires_60k +fi + +if [ $stage -le 4 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We align a subset of training data for + # this purpose. + utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/${train_set}_mixed_hires_30k/utt2spk) \ + data/${train_set} data/${train_set}_30k + + steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ + data/${train_set}_30k data/lang exp/tri6b exp/nnet3/tri6b_ali_30k +fi + +if [ $stage -le 5 ]; then + # Train a small system just for its LDA+MLLT transform.
We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/${train_set}_mixed_hires_30k data/lang \ + exp/nnet3/tri6b_ali_30k exp/nnet3/tri7b +fi + + +if [ $stage -le 6 ]; then + mkdir -p exp/nnet3 + # To train a diagonal UBM we don't need very much data, so use a small subset + # (actually, it's not that small: still around 100 hours). + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ + data/${train_set}_mixed_hires_30k 512 exp/nnet3/tri7b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 60k subset (about one fifth of the data, or 200 hours). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${train_set}_mixed_hires_60k exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + ivectordir=exp/nnet3/ivectors_${train_set} + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then + utils/create_split_dir.pl /export/b{09,10,11,12}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage + fi + # We extract iVectors on all the train data, which will be what we train the + # system on. With --utts-per-spk-max 2, the script. pairs the utterances + # into twos, and treats each of these pairs as one speaker. Note that these + # are extracted 'online'. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \ + data/${train_set}_hires_max2 exp/nnet3/extractor $ivectordir || exit 1; +fi + +if [ $stage -le 9 ]; then + for data in test_clean test_other dev_clean dev_other; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \ + data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || exit 1; + done + wait +fi + +exit 0; diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh new file mode 100755 index 00000000000..be253beda2f --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +. cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < is the directory in which the text corpus is downloaded" echo " is the directory in which the language model is stored" echo "Main options:" - echo " --hidden # default 150. Hidden layer size" - echo " --maxent-order # default 5. Maxent features order size" - echo " --maxent-size # default 1000. 
Maxent features hash size" + echo " --rnnlm-options # default '$rnnlm_options'. Command line arguments to pass to rnnlm" + echo " --rnnlm-tag # default '$rnnlm_tag' The tag is appended to exp/ folder name" echo " --num-threads # default 16. Number of concurrent threadss to train RNNLM" echo " --stage # 1 to download and prepare data, 2 to train RNNLM, 3 to rescore tri6b with a trained RNNLM" exit 1 @@ -36,51 +35,69 @@ fi s5_dir=`pwd` data_dir=`readlink -f $1` lm_dir=`readlink -f $2` -rnnlm_ver=rnnlm-hs-0.1b # Probably could make this an option, but Tomas's RNN will take long to train on 200K vocab -rnnlmdir=data/lang_rnnlm_h${hidden}_me${maxent_order}-${maxent_size} -export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH +modeldir=data/lang_${rnnlm_ver}_${rnnlm_tag} if [ $stage -le 1 ]; then echo "$0: Prepare training data for RNNLM" cd $data_dir - wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz - gunzip librispeech-lm-norm.txt.gz - $s5_dir/utils/filt.py $lm_dir/librispeech-vocab.txt librispeech-lm-norm.txt | shuf > librispeech-lm-norm.train.txt - $s5_dir/utils/filt.py $lm_dir/librispeech-vocab.txt <(awk '{$1=""; print $0}' $s5_dir/data/train_960/text) > librispeech-lm-norm.dev.txt - rm librispeech-lm-norm.txt + if [ -f "librispeech-lm-norm.dev.txt" ]; then + echo "$0: SKIP File librispeech-lm-norm.dev.txt already exists" + else + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz + gunzip librispeech-lm-norm.txt.gz + $s5_dir/utils/filt.py $lm_dir/librispeech-vocab.txt librispeech-lm-norm.txt | shuf > librispeech-lm-norm.train.txt + $s5_dir/utils/filt.py $lm_dir/librispeech-vocab.txt <(awk '{$1=""; print $0}' $s5_dir/data/train_960/text) > librispeech-lm-norm.dev.txt.tmp + mv librispeech-lm-norm.dev.txt.tmp librispeech-lm-norm.dev.txt + rm librispeech-lm-norm.txt + fi cd $s5_dir - + fi if [ $stage -le 2 ]; then echo "$0: Training RNNLM. It will probably take several hours." - cd $KALDI_ROOT/tools - if [ -f $rnnlm_ver/rnnlm ]; then - echo "Not installing the rnnlm toolkit since it is already there." 
+ $KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 + rnnlm_path="$(readlink -f $KALDI_ROOT)/tools/$rnnlm_ver/rnnlm" + cd $s5_dir + mkdir -p $modeldir + echo "$0: Model file: $modeldir/rnnlm" + if [ -f "$modeldir/rnnlm" ]; then + echo "$0: SKIP file '$modeldir/rnnlm' already exists" else - extras/install_rnnlm_hs.sh + rm -f $modeldir/rnnlm.tmp + rnnlm_cmd="$rnnlm_path" + if type taskset >/dev/null 2>&1 ; then + # HogWild works much faster if all threads are bound to the same physical CPU + rnnlm_cmd="taskset -c $(seq -s, 0 $(( $num_threads - 1 )) ) $rnnlm_cmd" + fi + $rnnlm_cmd -rnnlm $modeldir/rnnlm.tmp \ + -train $data_dir/librispeech-lm-norm.train.txt \ + -valid $data_dir/librispeech-lm-norm.dev.txt \ + -threads $num_threads $rnnlm_options -retry 1 -stop 1.0 2>&1 | tee $modeldir/rnnlm.log + touch $modeldir/unk.probs + awk '{print $1}' $modeldir/rnnlm.tmp > $modeldir/wordlist.rnn + mv $modeldir/rnnlm.tmp $modeldir/rnnlm + mv $modeldir/rnnlm.tmp.nnet $modeldir/rnnlm.nnet fi - cd $s5_dir - mkdir -p $rnnlmdir - rnnlm -rnnlm $rnnlmdir/rnnlm -train $data_dir/librispeech-lm-norm.train.txt -valid $data_dir/librispeech-lm-norm.dev.txt \ - -threads $num_threads -hidden $hidden -direct-order $maxent_order -direct $maxent_size -retry 1 -stop 1.0 - touch $rnnlmdir/unk.probs - awk '{print $1}' $rnnlmdir/rnnlm > $rnnlmdir/wordlist.rnn fi if [ $stage -le 3 ]; then echo "$0: Performing RNNLM rescoring on tri6b decoding results" - for lm in tgsmall tgmed; do + for lm in tgsmall tgmed tglarge; do for devset in dev_clean dev_other; do sourcedir=exp/tri6b/decode_${lm}_${devset} - resultsdir=${sourcedir}_rnnlm_h${hidden}_me${maxent_order}-${maxent_size} - steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver --N 100 0.5 data/lang_test_$lm $rnnlmdir data/$devset $sourcedir ${resultsdir}_L0.5 - cp -r ${resultsdir}_L0.5 ${resultsdir}_L0.25 - cp -r ${resultsdir}_L0.5 ${resultsdir}_L0.75 - steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver --N 100 --stage 7 0.25 data/lang_test_$lm $rnnlmdir data/$devset $sourcedir ${resultsdir}_L0.25 - steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver --N 100 --stage 7 0.75 data/lang_test_$lm $rnnlmdir data/$devset $sourcedir ${resultsdir}_L0.75 + if [ ! -d "$sourcedir" ]; then + echo "$0: WARNING cannot find source dir '$sourcedir' to rescore" + continue + fi + resultsdir=${sourcedir}_${rnnlm_ver}_${rnnlm_tag} + rm -rf ${resultsdir}_L0.5 + steps/rnnlmrescore.sh --skip_scoring false --rnnlm_ver $rnnlm_ver --N 100 0.5 data/lang_test_$lm $modeldir data/$devset $sourcedir ${resultsdir}_L0.5 + for coef in 0.25 0.75; do + rm -rf ${resultsdir}_L${coef} + cp -r ${resultsdir}_L0.5 ${resultsdir}_L${coef} + steps/rnnlmrescore.sh --skip_scoring false --rnnlm_ver $rnnlm_ver --N 100 --stage 7 $coef data/lang_test_$lm $modeldir data/$devset $sourcedir ${resultsdir}_L${coef} + done done done fi - - diff --git a/egs/librispeech/s5/local/score.sh b/egs/librispeech/s5/local/score.sh index f6359c189b4..3082c5eb9ee 100755 --- a/egs/librispeech/s5/local/score.sh +++ b/egs/librispeech/s5/local/score.sh @@ -13,6 +13,7 @@ reverse=false word_ins_penalty=0.0,0.5,1.0 min_lmwt=9 max_lmwt=20 +iter=final #end configuration section. [ -f ./path.sh ] && . ./path.sh diff --git a/egs/librispeech/s5/path.sh b/egs/librispeech/s5/path.sh index 74b6e31ad44..03df6dd9f2b 100755 --- a/egs/librispeech/s5/path.sh +++ b/egs/librispeech/s5/path.sh @@ -1,5 +1,7 @@ export KALDI_ROOT=`pwd`/../../..
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C # we use this both in the (optional) LM training and the G2P-related scripts diff --git a/egs/librispeech/s5/run.sh b/egs/librispeech/s5/run.sh index 02880f3741b..5e969418c93 100755 --- a/egs/librispeech/s5/run.sh +++ b/egs/librispeech/s5/run.sh @@ -2,7 +2,7 @@ # Set this to somewhere where you want to put your data, or where -# someone else has already put it. You'll want to change this +# someone else has already put it. You'll want to change this # if you're not on the CLSP grid. data=/export/a15/vpanayotov/data @@ -10,8 +10,8 @@ data=/export/a15/vpanayotov/data data_url=www.openslr.org/resources/12 lm_url=www.openslr.org/resources/11 -. cmd.sh -. path.sh +. ./cmd.sh +. ./path.sh # you might not want to do this for interactive shells. set -e @@ -24,12 +24,12 @@ for part in dev-clean test-clean dev-other test-other train-clean-100; do done # download the LM resources -local/download_lm.sh $lm_url data/local/lm || exit 1 +local/download_lm.sh $lm_url data/local/lm # format the data as Kaldi data directories for part in dev-clean test-clean dev-other test-other train-clean-100; do # use underscore-separated names in data directories. - local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) || exit 1 + local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) done ## Optional text corpus normalization and LM training @@ -39,7 +39,7 @@ done ## well as some intermediate data(e.g. the normalized text used for LM training), ## are available for download at http://www.openslr.org/11/ #local/lm/train_lm.sh $LM_CORPUS_ROOT \ -# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm || exit 1 +# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm ## Optional G2P training scripts. 
## As the LM training scripts above, this script is intended primarily to @@ -49,24 +49,24 @@ done # when "--stage 3" option is used below we skip the G2P steps, and use the # lexicon we have already downloaded from openslr.org/11/ local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \ - data/local/lm data/local/lm data/local/dict_nosp || exit 1 + data/local/lm data/local/lm data/local/dict_nosp utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; + "" data/local/lang_tmp_nosp data/lang_nosp -local/format_lms.sh --src-dir data/lang_nosp data/local/lm || exit 1 +local/format_lms.sh --src-dir data/lang_nosp data/local/lm # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_tglarge || exit 1; + data/lang_nosp data/lang_nosp_test_tglarge utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_fglarge || exit 1; + data/lang_nosp data/lang_nosp_test_fglarge mfccdir=mfcc # spread the mfccs over various machines, as this data-set is quite large. -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then +if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. - utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ + utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ $mfccdir/storage fi @@ -87,15 +87,15 @@ utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k # train a monophone system steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ - data/train_2kshort data/lang_nosp exp/mono || exit 1; + data/train_2kshort data/lang_nosp exp/mono # decode using the monophone model ( utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \ - exp/mono exp/mono/graph_nosp_tgsmall || exit 1 + exp/mono exp/mono/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \ - data/$test exp/mono/decode_nosp_tgsmall_$test || exit 1 + data/$test exp/mono/decode_nosp_tgsmall_$test done )& @@ -104,97 +104,97 @@ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ # train a first delta + delta-delta triphone system on a subset of 5000 utterances steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ - 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 || exit 1; + 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 # decode using the tri1 model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri1 exp/tri1/graph_nosp_tgsmall || exit 1; + exp/tri1 exp/tri1/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \ - data/$test exp/tri1/decode_nosp_tgsmall_$test || exit 1; + data/$test exp/tri1/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test done )& steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - 
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k || exit 1; + data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k # train an LDA+MLLT system. steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b || exit 1; + data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b # decode using the LDA+MLLT model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri2b exp/tri2b/graph_nosp_tgsmall || exit 1; + exp/tri2b exp/tri2b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \ - data/$test exp/tri2b/decode_nosp_tgsmall_$test || exit 1; + data/$test exp/tri2b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test done )& # Align a 10k utts subset using the tri2b model steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ - data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k || exit 1; + data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k # Train tri3b, which is LDA+MLLT+SAT on 10k utts steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b || exit 1; + data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b # decode using the tri3b model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri3b exp/tri3b/graph_nosp_tgsmall || exit 1; + exp/tri3b exp/tri3b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri3b/graph_nosp_tgsmall data/$test \ - exp/tri3b/decode_nosp_tgsmall_$test || exit 1; + exp/tri3b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test done )& # align the entire train_clean_100 subset using the tri3b model steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ data/train_clean_100 data/lang_nosp \ - exp/tri3b exp/tri3b_ali_clean_100 || exit 1; + exp/tri3b exp/tri3b_ali_clean_100 # train another LDA+MLLT+SAT system on the entire 100 hour subset steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ data/train_clean_100 data/lang_nosp \ - exp/tri3b_ali_clean_100 exp/tri4b || exit 1; + exp/tri3b_ali_clean_100 exp/tri4b # decode using the tri4b model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri4b exp/tri4b/graph_nosp_tgsmall || exit 1; + exp/tri4b exp/tri4b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri4b/graph_nosp_tgsmall data/$test \ - exp/tri4b/decode_nosp_tgsmall_$test || exit 1; + exp/tri4b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test 
exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test done )& @@ -205,137 +205,151 @@ steps/get_prons.sh --cmd "$train_cmd" \ utils/dict_dir_add_pronprobs.sh --max-normalize true \ data/local/dict_nosp \ exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ - exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict || exit 1 + exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict utils/prepare_lang.sh data/local/dict \ - "" data/local/lang_tmp data/lang + "" data/local/lang_tmp data/lang local/format_lms.sh --src-dir data/lang data/local/lm utils/build_const_arpa_lm.sh \ - data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge || exit 1; + data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge utils/build_const_arpa_lm.sh \ - data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge || exit 1; + data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge # decode using the tri4b model with pronunciation and silence probabilities ( utils/mkgraph.sh \ - data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall || exit 1; + data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri4b/graph_tgsmall data/$test \ - exp/tri4b/decode_tgsmall_$test || exit 1; + exp/tri4b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test done )& # align train_clean_100 using the tri4b model steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 || exit 1; + data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 # if you want at this point you can train and test NN model(s) on the 100 hour # subset -local/nnet2/run_5a_clean_100.sh || exit 1 +local/nnet2/run_5a_clean_100.sh -local/download_and_untar.sh $data $data_url train-clean-360 || exit 1; +local/download_and_untar.sh $data $data_url train-clean-360 # now add the "clean-360" subset to the mix ... local/data_prep.sh \ - $data/LibriSpeech/train-clean-360 data/train_clean_360 || exit 1 + $data/LibriSpeech/train-clean-360 data/train_clean_360 steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \ - exp/make_mfcc/train_clean_360 $mfccdir || exit 1 + exp/make_mfcc/train_clean_360 $mfccdir steps/compute_cmvn_stats.sh \ - data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir || exit 1 + data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir # ... 
and then combine the two sets into a 460 hour one utils/combine_data.sh \ - data/train_clean_460 data/train_clean_100 data/train_clean_360 || exit 1 + data/train_clean_460 data/train_clean_100 data/train_clean_360 # align the new, combined set, using the tri4b model steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 || exit 1; + data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 # create a larger SAT model, trained on the 460 hours of data. steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \ - data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b || exit 1; + data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b # decode using the tri5b model ( utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri5b exp/tri5b/graph_tgsmall || exit 1; + exp/tri5b exp/tri5b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri5b/graph_tgsmall data/$test \ - exp/tri5b/decode_tgsmall_$test || exit 1; + exp/tri5b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test done )& # train a NN model on the 460 hour set -local/nnet2/run_6a_clean_460.sh || exit 1 +local/nnet2/run_6a_clean_460.sh -local/download_and_untar.sh $data $data_url train-other-500 || exit 1; +local/download_and_untar.sh $data $data_url train-other-500 # prepare the 500 hour subset. local/data_prep.sh \ - $data/LibriSpeech/train-other-500 data/train_other_500 || exit 1 + $data/LibriSpeech/train-other-500 data/train_other_500 steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \ - exp/make_mfcc/train_other_500 $mfccdir || exit 1 + exp/make_mfcc/train_other_500 $mfccdir steps/compute_cmvn_stats.sh \ - data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir || exit 1 + data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir # combine all the data utils/combine_data.sh \ - data/train_960 data/train_clean_460 data/train_other_500 || exit 1 + data/train_960 data/train_clean_460 data/train_other_500 steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 || exit 1; + data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 # train a SAT model on the 960 hour mixed data. Use the train_quick.sh script # as it is faster. 
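# (train_quick.sh is faster mainly because it reuses the exp/tri5b_ali_960
# alignments and initializes the new model from the existing one rather than
# training from a flat start; see steps/train_quick.sh for the exact procedure.)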
steps/train_quick.sh --cmd "$train_cmd" \ - 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b || exit 1; + 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b # decode using the tri6b model ( utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri6b exp/tri6b/graph_tgsmall || exit 1; + exp/tri6b exp/tri6b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test || exit 1; + exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test done )& # steps/cleanup/debug_lexicon.sh --remove-stress true --nj 200 --cmd "$train_cmd" data/train_clean_100 \ # data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h -# #Perform RNNLM rescoring of tri6b +# #Perform rescoring of tri6b by means of faster-rnnlm # #Attention: with default settings requires 4 GB of memory per rescoring job, so commenting this out by default -# local/run_rnnlm.sh $data data/local/lm +# wait && local/run_rnnlm.sh \ +# --rnnlm-ver "faster-rnnlm" \ +# --rnnlm-options "-hidden 150 -direct 1000 -direct-order 5" \ +# --rnnlm-tag "h150-me5-1000" $data data/local/lm + +# #Perform rescoring of tri6b by means of faster-rnnlm using Noise contrastive estimation +# #Note that this could be extremely slow without CUDA +# #We use a smaller direct layer size so that it can be stored in GPU memory (~2Gb) +# #Surprisingly, the bottleneck here is validation rather than learning +# #Therefore you can use a smaller validation dataset to speed up training +# wait && local/run_rnnlm.sh \ +# --rnnlm-ver "faster-rnnlm" \ +# --rnnlm-options "-hidden 150 -direct 400 -direct-order 3 --nce 20" \ +# --rnnlm-tag "h150-me3-400-nce20" $data data/local/lm + # train NN models on the entire dataset -local/nnet2/run_7a_960.sh || exit 1 +local/nnet2/run_7a_960.sh # # train models on cleaned-up data # # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh diff --git a/egs/lre/v1/cmd.sh b/egs/lre/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre/v1/cmd.sh +++ b/egs/lre/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..."
-export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/lre/v1/path.sh b/egs/lre/v1/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/lre/v1/path.sh +++ b/egs/lre/v1/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/lre07/v1/cmd.sh b/egs/lre07/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre07/v1/cmd.sh +++ b/egs/lre07/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." 
-export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/lre07/v1/local/make_lre07.pl b/egs/lre07/v1/local/make_lre07.pl index db29880a2f4..3dd2c089d96 100755 --- a/egs/lre07/v1/local/make_lre07.pl +++ b/egs/lre07/v1/local/make_lre07.pl @@ -40,10 +40,10 @@ open(DUR10, ">$dir/10sec") || die "Failed opening output file $dir/10sec"; open(DUR30, ">$dir/30sec") || die "Failed opening output file $dir/30sec"; -my $key_str = `wget -qO- "http://www.itl.nist.gov/iad/mig/tests/lang/2007/lid07key_v5.txt"`; +my $key_str = `wget -qO- "http://www.openslr.org/resources/23/lre07_key.txt"`; @key_lines = split("\n",$key_str); -%utt2lang = (); -%utt2dur = (); +%utt2lang = (); +%utt2dur = (); foreach (@key_lines) { @words = split(' ', $_); if (index($words[0], "#") == -1) { diff --git a/egs/lre07/v1/path.sh b/egs/lre07/v1/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/lre07/v1/path.sh +++ b/egs/lre07/v1/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/lre07/v1/run.sh b/egs/lre07/v1/run.sh index ff3f8ad94cd..4579c06b523 100755 --- a/egs/lre07/v1/run.sh +++ b/egs/lre07/v1/run.sh @@ -1,10 +1,10 @@ #!/bin/bash -# Copyright 2014 David Snyder -# Daniel Povey +# Copyright 2014-2015 David Snyder +# Daniel Povey # Apache 2.0. # # This script runs the NIST 2007 General Language Recognition Closed-Set -# evaluation. +# evaluation. . cmd.sh . 
path.sh @@ -36,7 +36,7 @@ local/make_lre07_train.pl /export/corpora5/LDC/LDC2009S05 data local/make_lre09.pl /export/corpora5/NIST/LRE/LRE2009/eval data # Make the evaluation data set. We're concentrating on the General Language -# Recognition Closet-Set evaluation, so we remove the dialects and filter +# Recognition Closed-Set evaluation, so we remove the dialects and filter # out the unknown languages used in the open-set evaluation. local/make_lre07.pl /export/corpora5/LDC/LDC2009S04 data/lre07_all @@ -60,7 +60,8 @@ for d in $src_list; do rm -f $d/spk2gender 2>/dev/null; done utils/combine_data.sh data/train_unsplit $src_list # original utt2lang will remain in data/train_unsplit/.backup/utt2lang. -utils/apply_map.pl -f 2 --permissive local/lang_map.txt < data/train_unsplit/utt2lang 2>/dev/null > foo +utils/apply_map.pl -f 2 --permissive local/lang_map.txt \ + < data/train_unsplit/utt2lang 2>/dev/null > foo cp foo data/train_unsplit/utt2lang rm foo @@ -70,9 +71,9 @@ echo "**Language count in i-Vector extractor training (after splitting long utte awk '{print $2}' data/train/utt2lang | sort | uniq -c | sort -nr # This commented script is an alternative to the above utterance -# splitting method. Here we split the utterance based on the number of +# splitting method. Here we split the utterance based on the number of # frames which are voiced, rather than the total number of frames. -# max_voiced=3000 +# max_voiced=3000 # local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train use_vtln=true @@ -81,7 +82,7 @@ if $use_vtln; then cp -r data/${t} data/${t}_novtln rm -r data/${t}_novtln/{split,.backup,spk2warp} 2>/dev/null || true steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 100 --cmd "$train_cmd" \ - data/${t}_novtln exp/make_mfcc $mfccdir + data/${t}_novtln exp/make_mfcc $mfccdir lid/compute_vad_decision.sh data/${t}_novtln exp/make_mfcc $mfccdir done @@ -98,7 +99,7 @@ if $use_vtln; then data/train_5k_novtln exp/diag_ubm_vtln exp/vtln for t in lre07 train; do - lid/get_vtln_warps.sh --nj 100 --cmd "$train_cmd" \ + lid/get_vtln_warps.sh --nj 50 --cmd "$train_cmd" \ data/${t}_novtln exp/vtln exp/${t}_warps cp exp/${t}_warps/utt2warp data/$t/ done @@ -126,18 +127,18 @@ utils/subset_data_dir.sh data/train 5000 data/train_5k utils/subset_data_dir.sh data/train 10000 data/train_10k -lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k 2048 \ - exp/diag_ubm_2048 -lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_10k \ - exp/diag_ubm_2048 exp/full_ubm_2048_10k +lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=20G,ram_free=20G" \ + data/train_5k 2048 exp/diag_ubm_2048 +lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=20G,ram_free=20G" \ + data/train_10k exp/diag_ubm_2048 exp/full_ubm_2048_10k -lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \ - exp/full_ubm_2048_10k exp/full_ubm_2048 +lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ + data/train exp/full_ubm_2048_10k exp/full_ubm_2048 # Alternatively, a diagonal UBM can replace the full UBM used above. # The preceding calls to train_diag_ubm.sh and train_full_ubm.sh # can be commented out and replaced with the following lines. -# +# # This results in a slight degradation but could improve error rate when # there is less training data than used in this example. 
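# The -l mem_free=...,ram_free=... strings appended to "$train_cmd" in the UBM
# training calls above are GridEngine resource requests passed through queue.pl
# to qsub; with the queue-agnostic setup described in cmd.sh the same request
# can be expressed with the generic --mem option instead. A hypothetical
# equivalent of the last call above (sketch only, not an extra step):
#   lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd --mem 35G" \
#     data/train exp/full_ubm_2048_10k exp/full_ubm_2048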
# @@ -147,7 +148,8 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \ #gmm-global-to-fgmm exp/diag_ubm_2048/final.dubm \ # exp/full_ubm_2048/final.ubm -lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" \ +lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ + --use-weights true \ --num-iters 5 exp/full_ubm_2048/final.ubm data/train \ exp/extractor_2048 @@ -167,13 +169,13 @@ lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ exp/extractor_2048 data/lre07 exp/ivectors_lre07 lid/run_logistic_regression.sh --prior-scale 0.70 \ - --conf conf/logistic-regression.conf + --conf conf/logistic-regression.conf # Training error-rate -# ER (%): 5.15 +# ER (%): 3.95 # General LR 2007 closed-set eval local/lre07_eval/lre07_eval.sh exp/ivectors_lre07 \ local/general_lr_closed_set_langs.txt # Duration (sec): avg 3 10 30 -# ER (%): 23.58 43.95 19.43 7.37 -# C_avg (%): 14.79 27.23 12.16 4.97 +# ER (%): 23.11 42.84 19.33 7.18 +# C_avg (%): 14.17 26.04 11.93 4.52 diff --git a/egs/reverb/s5/RESULTS b/egs/reverb/s5/RESULTS index 031a6b2ec1a..3537852a827 100644 --- a/egs/reverb/s5/RESULTS +++ b/egs/reverb/s5/RESULTS @@ -1,306 +1,150 @@ -local/summarize_results.pl tri2a -#### RESULTS FOR dt ##### - -exp/tri2a/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 89.00 -RealData_dt_for_1ch_near_room1_A 90.39 -SimData_dt_for_1ch_far_room1_A 22.35 -SimData_dt_for_1ch_far_room2_A 88.37 -SimData_dt_for_1ch_far_room3_A 90.85 -SimData_dt_for_1ch_near_room1_A 12.29 -SimData_dt_for_1ch_near_room2_A 42.86 -SimData_dt_for_1ch_near_room3_A 50.17 -Avg_Sim(6) 51.15 -Avg_Real(2) 89.69 - - -#### RESULTS FOR et ##### - -exp/tri2a/decode_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2a_mc -#### RESULTS FOR dt ##### - -exp/tri2a_mc/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 51.88 -RealData_dt_for_1ch_near_room1_A 56.14 -SimData_dt_for_1ch_far_room1_A 17.45 -SimData_dt_for_1ch_far_room2_A 44.02 -SimData_dt_for_1ch_far_room3_A 49.90 -SimData_dt_for_1ch_near_room1_A 15.29 -SimData_dt_for_1ch_near_room2_A 22.11 -SimData_dt_for_1ch_near_room3_A 26.34 -Avg_Sim(6) 29.18 -Avg_Real(2) 54.01 - - -#### RESULTS FOR et ##### - -exp/tri2a_mc/decode_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2a_mc basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 43.95 -RealData_dt_for_1ch_near_room1_A 48.91 -SimData_dt_for_1ch_far_room1_A 16.37 -SimData_dt_for_1ch_far_room2_A 35.67 -SimData_dt_for_1ch_far_room3_A 39.59 -SimData_dt_for_1ch_near_room1_A 13.03 -SimData_dt_for_1ch_near_room2_A 17.08 -SimData_dt_for_1ch_near_room3_A 20.00 +#################### +exp/tri2a/decode_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 89.13 +RealData_dt_for_1ch_near_room1_A 90.27 +SimData_dt_for_1ch_far_room1_A 22.44 +SimData_dt_for_1ch_far_room2_A 88.44 +SimData_dt_for_1ch_far_room3_A 91.27 +SimData_dt_for_1ch_near_room1_A 12.19 +SimData_dt_for_1ch_near_room2_A 42.74 +SimData_dt_for_1ch_near_room3_A 49.31 +Avg_Real(2) 89.70 +Avg_Sim(6) 51.06 + +exp/tri2a/decode_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 88.45 +RealData_et_for_1ch_near_room1_A 88.66 +SimData_et_for_1ch_far_room1_A 22.72 +SimData_et_for_1ch_far_room2_A 81.53 +SimData_et_for_1ch_far_room3_A 89.25 +SimData_et_for_1ch_near_room1_A 14.37 +SimData_et_for_1ch_near_room2_A 40.46 
+SimData_et_for_1ch_near_room3_A 51.50 +Avg_Real(2) 88.56 +Avg_Sim(6) 49.97 + +#################### +exp/tri2a_mc/decode_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 53.38 +RealData_dt_for_1ch_near_room1_A 56.27 +SimData_dt_for_1ch_far_room1_A 16.96 +SimData_dt_for_1ch_far_room2_A 44.15 +SimData_dt_for_1ch_far_room3_A 49.88 +SimData_dt_for_1ch_near_room1_A 15.00 +SimData_dt_for_1ch_near_room2_A 21.81 +SimData_dt_for_1ch_near_room3_A 25.10 +Avg_Real(2) 54.83 +Avg_Sim(6) 28.82 + +exp/tri2a_mc/decode_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 52.94 +RealData_et_for_1ch_near_room1_A 55.35 +SimData_et_for_1ch_far_room1_A 18.91 +SimData_et_for_1ch_far_room2_A 37.33 +SimData_et_for_1ch_far_room3_A 46.69 +SimData_et_for_1ch_near_room1_A 17.77 +SimData_et_for_1ch_near_room2_A 21.23 +SimData_et_for_1ch_near_room3_A 26.17 +Avg_Real(2) 54.14 +Avg_Sim(6) 28.02 + +#################### +exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 46.27 +RealData_dt_for_1ch_near_room1_A 48.85 +SimData_dt_for_1ch_far_room1_A 15.59 +SimData_dt_for_1ch_far_room2_A 35.86 +SimData_dt_for_1ch_far_room3_A 39.54 +SimData_dt_for_1ch_near_room1_A 12.78 +SimData_dt_for_1ch_near_room2_A 17.75 +SimData_dt_for_1ch_near_room3_A 20.23 +Avg_Real(2) 47.56 Avg_Sim(6) 23.62 -Avg_Real(2) 46.43 - - -#### RESULTS FOR et ##### - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2b -#### RESULTS FOR dt ##### - -exp/tri2b/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 91.66 -RealData_dt_for_1ch_near_room1_A 91.33 -SimData_dt_for_1ch_far_room1_A 26.94 -SimData_dt_for_1ch_far_room2_A 85.63 -SimData_dt_for_1ch_far_room3_A 91.99 -SimData_dt_for_1ch_near_room1_A 11.95 -SimData_dt_for_1ch_near_room2_A 34.51 -SimData_dt_for_1ch_near_room3_A 44.81 -Avg_Sim(6) 49.30 -Avg_Real(2) 91.50 - - -#### RESULTS FOR et ##### - -exp/tri2b/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 91.29 -RealData_et_for_1ch_near_room1_A 92.05 -SimData_et_for_1ch_far_room1_A 24.16 -SimData_et_for_1ch_far_room2_A 78.57 -SimData_et_for_1ch_far_room3_A 91.01 -SimData_et_for_1ch_near_room1_A 13.76 -SimData_et_for_1ch_near_room2_A 32.94 -SimData_et_for_1ch_near_room3_A 48.24 -Avg_Sim(6) 48.11 -Avg_Real(2) 91.67 - - -local/summarize_results.pl tri2b_mc -#### RESULTS FOR dt ##### - -exp/tri2b_mc/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 45.18 -RealData_dt_for_1ch_near_room1_A 49.91 -SimData_dt_for_1ch_far_room1_A 15.78 -SimData_dt_for_1ch_far_room2_A 34.75 -SimData_dt_for_1ch_far_room3_A 37.56 -SimData_dt_for_1ch_near_room1_A 13.45 -SimData_dt_for_1ch_near_room2_A 17.57 -SimData_dt_for_1ch_near_room3_A 19.49 -Avg_Sim(6) 23.10 -Avg_Real(2) 47.55 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 47.67 -RealData_et_for_1ch_near_room1_A 50.65 -SimData_et_for_1ch_far_room1_A 16.69 -SimData_et_for_1ch_far_room2_A 30.36 -SimData_et_for_1ch_far_room3_A 38.08 -SimData_et_for_1ch_near_room1_A 15.67 -SimData_et_for_1ch_near_room2_A 17.71 -SimData_et_for_1ch_near_room3_A 20.10 -Avg_Sim(6) 23.10 -Avg_Real(2) 49.16 - - -local/summarize_results.pl tri2b_mc basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 39.37 -RealData_dt_for_1ch_near_room1_A 42.48 -SimData_dt_for_1ch_far_room1_A 14.11 -SimData_dt_for_1ch_far_room2_A 28.81 -SimData_dt_for_1ch_far_room3_A 31.53 
-SimData_dt_for_1ch_near_room1_A 11.18 -SimData_dt_for_1ch_near_room2_A 15.01 -SimData_dt_for_1ch_near_room3_A 15.48 -Avg_Sim(6) 19.35 -Avg_Real(2) 40.92 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 42.03 -RealData_et_for_1ch_near_room1_A 43.53 -SimData_et_for_1ch_far_room1_A 13.87 -SimData_et_for_1ch_far_room2_A 26.02 -SimData_et_for_1ch_far_room3_A 32.80 -SimData_et_for_1ch_near_room1_A 12.42 -SimData_et_for_1ch_near_room2_A 14.82 -SimData_et_for_1ch_near_room3_A 17.02 -Avg_Sim(6) 19.49 -Avg_Real(2) 42.78 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 43.06 -RealData_dt_for_1ch_near_room1_A 46.04 -SimData_dt_for_1ch_far_room1_A 13.59 -SimData_dt_for_1ch_far_room2_A 29.55 -SimData_dt_for_1ch_far_room3_A 32.52 -SimData_dt_for_1ch_near_room1_A 11.21 -SimData_dt_for_1ch_near_room2_A 15.23 -SimData_dt_for_1ch_near_room3_A 16.42 -Avg_Sim(6) 19.75 -Avg_Real(2) 44.55 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 43.45 -RealData_et_for_1ch_near_room1_A 46.89 -SimData_et_for_1ch_far_room1_A 13.37 -SimData_et_for_1ch_far_room2_A 25.96 -SimData_et_for_1ch_far_room3_A 31.73 -SimData_et_for_1ch_near_room1_A 11.89 -SimData_et_for_1ch_near_room2_A 14.64 -SimData_et_for_1ch_near_room3_A 17.26 -Avg_Sim(6) 19.14 -Avg_Real(2) 45.17 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 36.98 -RealData_dt_for_1ch_near_room1_A 39.68 -SimData_dt_for_1ch_far_room1_A 11.43 -SimData_dt_for_1ch_far_room2_A 25.24 -SimData_dt_for_1ch_far_room3_A 27.77 -SimData_dt_for_1ch_near_room1_A 9.19 -SimData_dt_for_1ch_near_room2_A 12.77 -SimData_dt_for_1ch_near_room3_A 13.30 -Avg_Sim(6) 16.62 -Avg_Real(2) 38.33 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 38.93 -RealData_et_for_1ch_near_room1_A 39.51 -SimData_et_for_1ch_far_room1_A 11.32 -SimData_et_for_1ch_far_room2_A 22.31 -SimData_et_for_1ch_far_room3_A 28.40 -SimData_et_for_1ch_near_room1_A 9.69 -SimData_et_for_1ch_near_room2_A 12.36 -SimData_et_for_1ch_near_room3_A 14.77 -Avg_Sim(6) 16.47 -Avg_Real(2) 39.22 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 31.58 -RealData_dt_for_1ch_near_room1_A 32.00 -SimData_dt_for_1ch_far_room1_A 8.51 -SimData_dt_for_1ch_far_room2_A 18.36 -SimData_dt_for_1ch_far_room3_A 20.40 -SimData_dt_for_1ch_near_room1_A 6.47 -SimData_dt_for_1ch_near_room2_A 9.61 -SimData_dt_for_1ch_near_room3_A 9.59 -Avg_Sim(6) 12.16 -Avg_Real(2) 31.79 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 30.32 -RealData_et_for_1ch_near_room1_A 32.45 -SimData_et_for_1ch_far_room1_A 7.74 -SimData_et_for_1ch_far_room2_A 17.01 -SimData_et_for_1ch_far_room3_A 21.05 -SimData_et_for_1ch_near_room1_A 7.01 -SimData_et_for_1ch_near_room2_A 9.52 -SimData_et_for_1ch_near_room3_A 11.29 -Avg_Sim(6) 12.27 -Avg_Real(2) 31.39 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 mbr_basis_fmllr -#### RESULTS FOR dt ##### - 
-exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 30.96 -RealData_dt_for_1ch_near_room1_A 30.88 -SimData_dt_for_1ch_far_room1_A 8.33 -SimData_dt_for_1ch_far_room2_A 18.14 -SimData_dt_for_1ch_far_room3_A 20.15 -SimData_dt_for_1ch_near_room1_A 6.24 -SimData_dt_for_1ch_near_room2_A 9.47 -SimData_dt_for_1ch_near_room3_A 9.62 -Avg_Sim(6) 11.99 -Avg_Real(2) 30.92 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 29.37 -RealData_et_for_1ch_near_room1_A 31.84 -SimData_et_for_1ch_far_room1_A 7.64 -SimData_et_for_1ch_far_room2_A 16.86 -SimData_et_for_1ch_far_room3_A 20.59 -SimData_et_for_1ch_near_room1_A 6.93 -SimData_et_for_1ch_near_room2_A 9.48 -SimData_et_for_1ch_near_room3_A 11.19 -Avg_Sim(6) 12.11 -Avg_Real(2) 30.61 +exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 48.11 +RealData_et_for_1ch_near_room1_A 48.42 +SimData_et_for_1ch_far_room1_A 16.57 +SimData_et_for_1ch_far_room2_A 31.54 +SimData_et_for_1ch_far_room3_A 39.32 +SimData_et_for_1ch_near_room1_A 14.31 +SimData_et_for_1ch_near_room2_A 18.42 +SimData_et_for_1ch_near_room3_A 21.03 +Avg_Real(2) 48.27 +Avg_Sim(6) 23.53 + +#################### +exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 34.04 +RealData_dt_for_1ch_near_room1_A 33.37 +SimData_dt_for_1ch_far_room1_A 10.57 +SimData_dt_for_1ch_far_room2_A 22.63 +SimData_dt_for_1ch_far_room3_A 25.00 +SimData_dt_for_1ch_near_room1_A 7.57 +SimData_dt_for_1ch_near_room2_A 10.97 +SimData_dt_for_1ch_near_room3_A 12.59 +Avg_Real(2) 33.70 +Avg_Sim(6) 14.89 + +exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 33.49 +RealData_et_for_1ch_near_room1_A 34.72 +SimData_et_for_1ch_far_room1_A 10.03 +SimData_et_for_1ch_far_room2_A 20.16 +SimData_et_for_1ch_far_room3_A 25.08 +SimData_et_for_1ch_near_room1_A 8.45 +SimData_et_for_1ch_near_room2_A 11.16 +SimData_et_for_1ch_near_room3_A 12.88 +Avg_Real(2) 34.11 +Avg_Sim(6) 14.63 + +#################### +exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 31.17 +RealData_dt_for_1ch_near_room1_A 31.82 +SimData_dt_for_1ch_far_room1_A 8.53 +SimData_dt_for_1ch_far_room2_A 17.43 +SimData_dt_for_1ch_far_room3_A 21.04 +SimData_dt_for_1ch_near_room1_A 6.78 +SimData_dt_for_1ch_near_room2_A 8.97 +SimData_dt_for_1ch_near_room3_A 10.01 +Avg_Real(2) 31.50 +Avg_Sim(6) 12.13 + +exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 31.20 +RealData_et_for_1ch_near_room1_A 30.98 +SimData_et_for_1ch_far_room1_A 8.42 +SimData_et_for_1ch_far_room2_A 17.63 +SimData_et_for_1ch_far_room3_A 20.71 +SimData_et_for_1ch_near_room1_A 7.03 +SimData_et_for_1ch_near_room2_A 9.50 +SimData_et_for_1ch_near_room3_A 11.11 +Avg_Real(2) 31.09 +Avg_Sim(6) 12.40 + +#################### +exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 30.42 +RealData_dt_for_1ch_near_room1_A 31.50 +SimData_dt_for_1ch_far_room1_A 8.24 +SimData_dt_for_1ch_far_room2_A 17.25 +SimData_dt_for_1ch_far_room3_A 20.72 +SimData_dt_for_1ch_near_room1_A 6.76 +SimData_dt_for_1ch_near_room2_A 8.87 +SimData_dt_for_1ch_near_room3_A 9.92 +Avg_Real(2) 30.96 +Avg_Sim(6) 11.96 + +exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 30.89 +RealData_et_for_1ch_near_room1_A 31.01 +SimData_et_for_1ch_far_room1_A 8.20 
+SimData_et_for_1ch_far_room2_A 17.34 +SimData_et_for_1ch_far_room3_A 20.56 +SimData_et_for_1ch_near_room1_A 6.91 +SimData_et_for_1ch_near_room2_A 9.50 +SimData_et_for_1ch_near_room3_A 10.93 +Avg_Real(2) 30.95 +Avg_Sim(6) 12.24 diff --git a/egs/reverb/s5/cmd.sh b/egs/reverb/s5/cmd.sh index e88b07e1195..71dd849a93b 100644 --- a/egs/reverb/s5/cmd.sh +++ b/egs/reverb/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64,gpu=1 -q g.q" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/reverb/s5/corpus.sh b/egs/reverb/s5/corpus.sh deleted file mode 100644 index 32a2ee4b85b..00000000000 --- a/egs/reverb/s5/corpus.sh +++ /dev/null @@ -1,17 +0,0 @@ -if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then - REVERB_home=/export/corpora5/REVERB_2014/REVERB - export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 - # set LDC WSJ0 directory to obtain LMs - # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) - export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B - # It is assumed that there will be a 'wsj0' subdirectory - # within the top-level corpus directory -else - echo "Set the data directory locations." 
&& exit 1; -fi - -export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt -export reverb_et=$REVERB_home/REVERB_WSJCAM0_et -export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev -export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval - diff --git a/egs/reverb/s5/local/Generate_mcTrainData_cut.m b/egs/reverb/s5/local/Generate_mcTrainData_cut.m old mode 100644 new mode 100755 diff --git a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh index c3de2ba7fd3..a4599f97702 100755 --- a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh +++ b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh @@ -65,8 +65,8 @@ if [ ! -z "$3" ]; then dt_or_x=$3 fi -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions +# unfortunately, we need a pointer to HTK baseline +# since the corpus does NOT contain the data set descriptions # for the REVERB Challenge taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch @@ -97,11 +97,11 @@ s/\x0D$//' \ # e.g. yield' --> yield # reason: YIELD' is not in dict, while YIELD is s/YIELD'/YIELD/g - s/'ROOTS'/ROOTS/g - s/'WHERE/WHERE/g + s/'ROOTS'/ROOTS/g + s/'WHERE/WHERE/g s/PEOPLE'/PEOPLE/g s/SIT'/SIT/g - s/'DOMINEE/DOMINEE/g + s/'DOMINEE/DOMINEE/g s/CHURCH'/CHURCH/g" \ -e ' # fix the single missing double full stop issue at the end of an utterance @@ -110,9 +110,9 @@ s/\x0D$//' \ /^[A-Z]$/ { # append a line N - # search for single dot on the second line + # search for single dot on the second line /\n\./ { - # found it - now replace the + # found it - now replace the s/\([A-Z]\)\n\./\1\.\n\./ } }' \ @@ -156,9 +156,9 @@ echo "Data preparation for $set succeeded" mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --nj 10 \ +#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do +#for x in si_tr; do +steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; diff --git a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh index 2c169e84b59..6ab2f2f4b73 100755 --- a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh +++ b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh @@ -50,8 +50,8 @@ fi cd $dir MIC=primary -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions +# unfortunately, we need a pointer to HTK baseline +# since the corpus does NOT contain the data set descriptions # for the REVERB Challenge taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch #taskFiles=`ls $taskFileDir/*Data_dt_for_*` @@ -108,9 +108,9 @@ echo "Data preparation for $set succeeded" mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --nj 10 \ +#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do +#for x in si_tr; do +steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; diff --git a/egs/reverb/s5/local/calc_wer.sh 
b/egs/reverb/s5/local/calc_wer.sh new file mode 100755 index 00000000000..c4b5eeb87f3 --- /dev/null +++ b/egs/reverb/s5/local/calc_wer.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2016 MERL (author: Shinji Watanabe) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +. ./cmd.sh +. ./path.sh + +lmw=15 +am="tri2a" +lm="bg_5k" +decode="" + +. utils/parse_options.sh + +if [ ! -z $decode ]; then + decode="_$decode" +fi + +dir="exp/$am/decode${decode}_${lm}_REVERB_" +echo "####################" +echo "${dir}*dt*" +for a in `echo ${dir}*dt* | tr " " "\n" | grep -v "A\.si"`; do + echo $a | awk -F '_' '{for(i=NF-6;i -1) { - if ($ARGV[0] =~ /^--lmw=(\d+)$/) - { - $opt_lmw = $1 + 0; - shift @ARGV; - } - elsif ($ARGV[0] =~ /^--lm=(\w+)$/) { - $lm = $1; - shift @ARGV; - } - else { - last; - } -} - - -print "$0 @ARGV\n"; - -my $system = "tri2b_mc"; -if ($ARGV[0] ne "") { $system = $ARGV[0]; } - -for my $dt_or_et ("dt", "et") { - -print "#### RESULTS FOR $dt_or_et ##### \n\n"; - -my $pref = "REVERB_$dt_or_et"; -#if ($lm ne "bg_5k") { -$pref = "${lm}_$pref"; -#} -if ($ARGV[1] ne "") { $pref = $ARGV[1] . '_' . $pref; } -if ($ARGV[2] ne "") { $pref = $pref . '_' . $ARGV[2]; } - -my $suff = ""; - -print "exp/$system/decode_$suff$pref*\n"; -my @folders = glob("exp/$system/decode_$suff$pref*"); - -my ($min_lmw, $max_lmw) = (9, 20); -@folders = grep { -f "$_/wer_$min_lmw" } @folders; -my @sum_wer; -my %wer; -my %avg_wer_disp; -my $nc = 0; -my $ns = 0; -my $nr = 0; -for my $lmw ($min_lmw..$max_lmw) -{ - for my $fold (@folders) { - my $res_file = "$fold/wer_$lmw"; - #print "fold = $fold pref = $pref\n"; - #my ($cond) = $fold =~ /decode_(\w+)$/; - my ($cond) = $fold =~ /decode_\Q$suff\E\Q${pref}\E_(\w+)$/; - if ($cond =~ /^Sim.+(far|near|cln)|^Real/) { - open(RES, $res_file) or die "$res_file: $_"; - while () { - if (/%WER\s+(\S+)/) { - my $wer = $1; - #print "cond = $cond lmw = $lmw wer = $1\n"; - if ($cond !~ /cln/) { - $sum_wer[$lmw] += $wer; - } - $wer{$cond}[$lmw] = $wer; - } - } - #print "cond = $cond fold = $fold\n"; - } - } -} - -if (!$opt_lmw && $dt_or_et eq "dt") { - $opt_lmw = $min_lmw; - for my $lmw ($min_lmw+1..$max_lmw) { - if ($sum_wer[$lmw] < $sum_wer[$opt_lmw]) { - $opt_lmw = $lmw; - } - } -} - -print "LMW = $opt_lmw\n"; -for my $cond (sort keys %wer) { - print "$cond\t$wer{$cond}[$opt_lmw]\n"; - if ($cond =~ /SimData_[de]t/) { - if ($cond !~ /cln/) { - $avg_wer_disp{"SimData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"SimData"}) / ++$ns; - } - else { - $avg_wer_disp{"CleanData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"CleanData"}) / ++$nc; - } - } - elsif ($cond =~ /RealData_[de]t/) { - $avg_wer_disp{"RealData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"RealData"}) / ++$nr; - } -} - -#print "Avg_Clean($nc)\t", sprintf("%.2f", $avg_wer_disp{"CleanData"}), "\n"; -print "Avg_Sim($ns)\t", sprintf("%.2f", $avg_wer_disp{"SimData"}), "\n"; -print "Avg_Real($nr)\t", sprintf("%.2f", $avg_wer_disp{"RealData"}), "\n"; -print "\n\n"; - -} diff 
--git a/egs/reverb/s5/local/wsjcam0_format_data.sh b/egs/reverb/s5/local/wsjcam0_format_data.sh
index aa1e8224fc9..883cb20ed0e 100755
--- a/egs/reverb/s5/local/wsjcam0_format_data.sh
+++ b/egs/reverb/s5/local/wsjcam0_format_data.sh
@@ -50,22 +50,8 @@ for lm_suffix in bg_5k tg_5k; do
    cp -r data/lang/$f $test
  done
  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-    utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history. Encountered some other similar
-  # things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt. These can cause
-  # determinization failures of CLG [ends up being epsilon cycles].
-  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-      --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+    arpa2fst --disambig-symbol=#0 \
+      --read-symbol-table=$test/words.txt - $test/G.fst
  fstisstochastic $test/G.fst
  # The output is like:
  # 9.14233e-05 -0.259833
@@ -83,7 +69,7 @@ for lm_suffix in bg_5k tg_5k; do
    < "$lexicon" >$tmpdir/g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
    fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
+  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r $tmpdir/g
done
diff --git a/egs/reverb/s5/path.sh b/egs/reverb/s5/path.sh
index eea6b7a8293..1a6fb5f891b 100644
--- a/egs/reverb/s5/path.sh
+++ b/egs/reverb/s5/path.sh
@@ -1,3 +1,5 @@
export KALDI_ROOT=`pwd`/../../..
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
diff --git a/egs/reverb/s5/run.sh b/egs/reverb/s5/run.sh
index 0e3eac6e6c1..ffb0b20422d 100755
--- a/egs/reverb/s5/run.sh
+++ b/egs/reverb/s5/run.sh
@@ -15,89 +15,92 @@
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
+# This is a shell script, but it's recommended that you run the commands one by
+# one by copying and pasting into the shell.
+# Caution: some of the graph creation steps use quite a bit of memory, so you
+# should run this on a machine that has sufficient memory.
+
# Requirements) matlab and tcsh
if [ ! `which tcsh` ]; then
-  echo "Install tcsh, which is used in some REVERB scripts"
-  exit 1
+  echo "Install tcsh, which is used in some REVERB scripts"
+  exit 1
fi
if [ !
`which matlab` ]; then - echo "Install matlab, which is used to generate multi-condition data" - exit 1 + echo "Install matlab, which is used to generate multi-condition data" + exit 1 fi -if [ ! -e path.sh ] || [ ! -e corpus.sh ]; then - echo "ERROR: path.sh and/or corpus.sh not found" - echo "You need to create these from {path,corpus}.sh.default to match your system" - echo "Make sure you follow the instructions in ../README.txt" - exit 1 +. ./cmd.sh +. ./path.sh + +stage=1 +. utils/parse_options.sh +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail + +# please make sure to set the paths of the REVERB and WSJ0 data +if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then + REVERB_home=/export/corpora5/REVERB_2014/REVERB + export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 + # set LDC WSJ0 directory to obtain LMs + # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) + export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B + # It is assumed that there will be a 'wsj0' subdirectory + # within the top-level corpus directory +elif [[ $(hostname -f) == *.merl.com ]] ; then + REVERB_home=/db/laputa1/data/original/public/REVERB + export wsjcam0=$REVERB_home/wsjcam0 + # set LDC WSJ0 directory to obtain LMs + # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) + export wsj0=/db/laputa1/data/original/public/WSJ0/11-13.1 #LDC93S6A or LDC93S6B + # It is assumed that there will be a 'wsj0' subdirectory + # within the top-level corpus directory +else + echo "Set the data directory locations." && exit 1; fi +export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt +export reverb_et=$REVERB_home/REVERB_WSJCAM0_et +export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev +export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval -. ./cmd.sh - -# please make sure to set the paths of the REVERB and WSJ0 data -. ./corpus.sh - -# set the directory of the multi-condition training data generated +# set the directory of the multi-condition training data to be generated reverb_tr=`pwd`/data_tr_cut/REVERB_WSJCAM0_tr_cut # LDA context size (left/right) (4 is default) context_size=4 -# The language models with which to decode (tg_5k or bg_5k or "tg_5k bg_5k" for -# both) -lms="bg_5k tg_5k" +# The language models with which to decode (tg_5k or bg_5k) +lm="tg_5k" # number of jobs for feature extraction and model training nj_train=30 # number of jobs for decoding -# use less jobs for trigram model -# if you have enough RAM (~ 32 GB), you can use 8 jobs for trigram as well -nj_bg=8 -nj_tg=8 -nj_bg=25 ## -nj_tg=25 ## - -# set to true if running from scratch -do_prep=true +nj_decode=8 # set to true if you want the tri2a systems (re-implementation of the HTK baselines) do_tri2a=true - -# The following are the settings determined by Gaussian Process optimization. -# However, they are not used in the final system. -# You can use the code below for training the "tri2c_mc" system. - -# LDA parameters for MCT recognizer. -# Use significantly more context than the default (7 frames ~ 85 ms) -mct_lda_left_context=7 -mct_lda_right_context=5 - -# Number of states and Gaussians for the MCT recognizer. -mct_nstates=7500 -mct_ngauss=45000 - -## End of GP tuned settings - -false && { -if $do_prep; then +if [ $stage -le 1 ]; then # Generate multi-condition training data # Note that utterance lengths match the original set. 
# This enables using clean alignments in multi-condition training (stereo training) - #local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr + local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr +fi +if [ $stage -le 2 ]; then # Prepare wsjcam0 clean data and wsj0 language model. - local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 || exit 1 + local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 # Prepare merged BEEP/CMU dictionary. - local/wsj_prepare_beep_dict.sh || exit 1; + local/wsj_prepare_beep_dict.sh # Prepare wordlists, etc. - utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang # Prepare directory structure for clean data. Apply some language model fixes. - local/wsjcam0_format_data.sh || exit 1; + local/wsjcam0_format_data.sh # Now it's getting more interesting. # Prepare the multi-condition training data and the REVERB dt set. @@ -108,253 +111,227 @@ if $do_prep; then # local/REVERB_wsjcam0_data_prep.sh /path/to/processed/REVERB_WSJCAM0_dt processed_REVERB_dt dt # The first argument is supposed to point to a folder that has the same structure # as the REVERB corpus. - local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr || exit 1; - local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt || exit 1; - local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et || exit 1; + local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr + local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt + local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et # Prepare the REVERB "real" dt set from MCWSJAV corpus. # This corpus is *never* used for training. # This creates the data set called REVERB_Real_dt and its subfolders - local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt || exit 1; + local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt # The MLF file exists only once in the corpus, namely in the real_dt directory # so we pass it as 4th argument - local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf || exit 1; + local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf +fi +if [ $stage -le 3 ]; then # Extract MFCC features for clean sets. # For the non-clean data sets, this is outsourced to the data preparation scripts. mfccdir=mfcc ### for x in si_tr si_dt; do it seems that the number of transcriptions of si_dt is not correct. - for x in si_tr; do - steps/make_mfcc.sh --nj $nj_train \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + for x in si_tr; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj_train \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir done fi -# Train monophone model on clean data (si_tr). -if [ ! -e exp/mono0a/final.mdl ]; then - echo "### TRAINING mono0a ###" - steps/train_mono.sh --boost-silence 1.25 --nj $nj_train \ - data/si_tr data/lang exp/mono0a || exit 1; +if [ $stage -le 4 ]; then + # Train monophone model on clean data (si_tr). + echo "### TRAINING mono0a ###" + steps/train_mono.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/mono0a + + # Align monophones with clean data. + echo "### ALIGNING mono0a_ali ###" + steps/align_si.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/mono0a exp/mono0a_ali + + # Create first triphone recognizer. 
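  # (In steps/train_deltas.sh the two numeric arguments below are the target
  # number of decision-tree leaves and the total number of Gaussians, here
  # 2000 and 10000.) An optional sanity check of the finished model, as a sketch:
  #   gmm-info exp/tri1/final.mdl   # prints the number of pdfs and Gaussians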
+ echo "### TRAINING tri1 ###" + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 + + echo "### ALIGNING tri1_ali ###" + # Re-align triphones. + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/tri1 exp/tri1_ali fi -# Align monophones with clean data. -if [ ! -e exp/mono0a_ali/ali.1.gz ]; then - echo "### ALIGNING mono0a_ali ###" - steps/align_si.sh --boost-silence 1.25 --nj $nj_train \ - data/si_tr data/lang exp/mono0a exp/mono0a_ali || exit 1; -fi - -# Create first triphone recognizer. -if [ ! -e exp/tri1/final.mdl ]; then - echo "### TRAINING tri1 ###" - steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 || exit 1; -fi - -# Prepare first triphone recognizer and decode clean si_dt for verification. -#utils/mkgraph.sh data/lang_test_bg_5k exp/tri1 exp/tri1/graph_bg_5k || exit 1; -#steps/decode.sh --nj 8 exp/tri1/graph_bg_5k data/si_dt exp/tri1/decode_si_dt - -if [ ! -e exp/tri1_ali/ali.1.gz ]; then - echo "### ALIGNING tri1_ali ###" - # Re-align triphones. - steps/align_si.sh --nj $nj_train \ - data/si_tr data/lang exp/tri1 exp/tri1_ali || exit 1; -fi - - # The following code trains and evaluates a delta feature recognizer, which is similar to the HTK # baseline (but using per-utterance basis fMLLR instead of batch MLLR). This is for reference only. if $do_tri2a; then +if [ $stage -le 5 ]; then # Train tri2a, which is deltas + delta-deltas, on clean data. - steps/train_deltas.sh \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a # Re-align triphones using clean data. This gives a smallish performance gain. - steps/align_si.sh --nj $nj_train \ - data/si_tr data/lang exp/tri2a exp/tri2a_ali || exit 1; + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/tri2a exp/tri2a_ali # Train a multi-condition triphone recognizer. # This uses alignments on *clean* data, which is allowed for REVERB. - # However, we have to use the "cut" version so that the length of the + # However, we have to use the "cut" version so that the length of the # waveforms match. # It is actually asserted by the Challenge that clean and multi-condition waves are aligned. - steps/train_deltas.sh \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc # Prepare clean and mc tri2a models for decoding. 
- utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k - utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k + utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k & + utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k & + wait +fi +if [ $stage -le 6 ]; then # decode REVERB dt using tri2a, clean - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode.sh --nj $nj_bg \ - exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done # decode REVERB dt using tri2a, mc - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode.sh --nj $nj_bg \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done + # basis fMLLR for tri2a_mc system # This computes a transform for every training utterance and computes a basis from that. - steps/get_fmllr_basis.sh --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc || exit 1; + steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc # Recognition using fMLLR adaptation (per-utterance processing). - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode_basis_fmllr.sh --nj $nj_bg \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done - -fi # train tri2a, tri2a_mc - - -# Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. -if [ ! -e exp/tri2b/final.mdl ]; then - echo "### TRAINING tri2b ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b || exit 1; + wait fi - -# tri2b (LDA-MLLT system) with multi-condition training, using default parameters. -if [ ! -e exp/tri2b_mc/final.mdl ]; then - echo "### TRAINING tri2b_mc ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc || exit 1; fi - -# tri2c (LDA-MLLT system) with multi-condition training, optimized parameters. -# Disabled by default -- it only improves slightly, and tends to overfit. -if [ ! -e exp/tri2c_mc/final.mdl ]; then - echo "### TRAINING tri2c_mc ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$mct_lda_left_context --right-context=$mct_lda_right_context" \ - $mct_nstates $mct_ngauss data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2c_mc || exit 1; +if [ $stage -le 7 ]; then + # Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. 
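  # With --splice-opts "--left-context=$context_size --right-context=$context_size"
  # (context_size=4 above), each frame is spliced together with 4 neighbours on
  # either side before the LDA+MLLT transform is estimated; assuming the default
  # 13-dim MFCCs and the script's default --dim 40, that is a 9 x 13 = 117 dim
  # input projected down to 40 dims. A hypothetical check of the feature dimension:
  #   feat-to-dim scp:data/si_tr/feats.scp -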
+ echo "### TRAINING tri2b ###" + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=$context_size --right-context=$context_size" \ + 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b + + # tri2b (LDA-MLLT system) with multi-condition training, using default parameters. + echo "### TRAINING tri2b_mc ###" + steps/train_lda_mllt.sh --cmd "$train_cmd"\ + --splice-opts "--left-context=$context_size --right-context=$context_size" \ + 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc fi - # Prepare tri2b* systems for decoding. -for recog in tri2b tri2b_mc; do - for lm in $lms; do - graph=exp/$recog/graph_$lm - if [ ! -e "$graph" ]; then - echo "### MAKING GRAPH $graph ###" - utils/mkgraph.sh data/lang_test_$lm exp/$recog $graph || exit 1; - fi - done -done - +if [ $stage -le 8 ]; then + echo "### MAKING GRAPH {tri2b,tri2b_mc}/graph_$lm ###" + for recog in tri2b tri2b_mc; do + utils/mkgraph.sh data/lang_test_$lm exp/$recog exp/$recog/graph_$lm & + done + wait +fi # discriminative training on top of multi-condition systems # one could also add tri2b here to have a DT clean recognizer for reference -for base_recog in tri2b_mc; do - - bmmi_recog=${base_recog}_mmi_b0.1 - echo "### DT $base_recog --> $bmmi_recog ###" +if [ $stage -le 9 ]; then + base_recog=tri2b_mc + bmmi_recog=${base_recog}_mmi_b0.1 + echo "### DT $base_recog --> $bmmi_recog ###" + + # get alignments from base recognizer + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali + + # get lattices from base recognizer + denlats_dir=${base_recog}_denlats + subsplit=`echo $nj_train \* 2 | bc` + # DT with multi-condition data ... + steps/make_denlats.sh --sub-split $subsplit --nj $nj_train --cmd "$decode_cmd" \ + data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir + + # boosted MMI training + steps/train_mmi.sh --boost 0.1 --cmd "$train_cmd" \ + data/REVERB_tr_cut/SimData_tr_for_1ch_A \ + data/lang \ + exp/${base_recog}_ali \ + exp/$denlats_dir \ + exp/$bmmi_recog + cp exp/$base_recog/ali.* exp/$bmmi_recog +fi - # get alignments from base recognizer - if [ ! -e exp/${base_recog}_ali/ali.1.gz ]; then - steps/align_si.sh --nj $nj_train \ - --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali || exit 1; - fi +# decoding using various recognizers +if [ $stage -le 10 ]; then + # put tri2b last since it takes longest due to the large mismatch. 
+ for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do + # The graph from the ML directory is used in recipe + recog2=`echo $recog | sed s/_mmi.*//` + graph=exp/$recog2/graph_$lm + + echo "### DECODING with $recog, noadapt, $lm ###" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + $graph $dataset \ + exp/$recog/decode_$decode_suff & + done + wait + + echo " ## MBR RESCORING with $recog, noadapt ##" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + mkdir -p exp/$recog/decode_mbr_$decode_suff + cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff + local/score_mbr.sh --cmd "$decode_cmd" \ + $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff & + done + wait - # get lattices from base recognizer - denlats_dir=${base_recog}_denlats - subsplit=`echo $nj_train \* 2 | bc` - if [ ! -e exp/$denlats_dir/.done.1 ]; then - # DT with multi-condition data ... - steps/make_denlats.sh --sub-split $subsplit --nj $nj_train \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir || exit 1; - fi + done # loop recog +fi - # boosted MMI training - if [ ! -e exp/$bmmi_recog/final.mdl ]; then - steps/train_mmi.sh --boost 0.1 \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A \ - data/lang \ - exp/${base_recog}_ali \ - exp/$denlats_dir \ - exp/$bmmi_recog || exit 1; - cp exp/$base_recog/ali.* exp/$bmmi_recog +# decoding using various recognizers with adaptation +if [ $stage -le 11 ]; then + # put tri2b last since it takes longest due to the large mismatch. + for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do + # The graph from the ML directory is used in recipe + recog2=`echo $recog | sed s/_mmi.*//` + graph=exp/$recog2/graph_$lm + + # set the adaptation data + if [[ "$recog" =~ _mc ]]; then + tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A + else + tr_dataset=si_tr fi -done - -} + echo "### DECODING with $recog, basis_fmllr, $lm ###" + steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/$tr_dataset data/lang exp/$recog + for dataset in data/REVERB_*{dt,et}/*; do + ( + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ + $graph $dataset \ + exp/$recog/decode_basis_fmllr_$decode_suff + ) & + done + wait + + echo " ## MBR RESCORING with $recog, basis_fmllr ##" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff + cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff + local/score_mbr.sh --cmd "$decode_cmd" \ + $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff & + done + wait -# decoding using bigram / trigram and various recognizers -do_adapt=true -for lm in $lms; do - if [[ "$lm" =~ tg ]]; then - nj=$nj_tg - else - nj=$nj_bg - fi - # put tri2b last since it takes longest due to the large mismatch. 
- for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do - # The graph from the ML directory is used in recipe - recog2=`echo $recog | sed s/_mmi.*//` - graph=exp/$recog2/graph_$lm - for dataset in data/REVERB_dt/SimData_dt* \ - data/REVERB_et/SimData_et* \ - data/REVERB_Real_dt/RealData_dt* \ - data/REVERB_Real_et/RealData_et*; do - if [[ $dataset =~ _dt ]]; then - pdataset=REVERB_dt - elif [[ $dataset =~ _et ]]; then - pdataset=REVERB_et - else - echo "$0: Cannot figure out what to do with: $dataset" - exit 1 - fi - #pdataset=$(basename $(dirname $dataset)) - #echo $pdataset - decode_suff=${lm}_${pdataset}_`basename $dataset` - if [ ! -e exp/$recog/decode_$decode_suff/wer_15 ]; then - echo "### DECODING $dataset | $recog, noadapt, $lm ###" - steps/decode.sh --nj $nj \ - $graph $dataset \ - exp/$recog/decode_$decode_suff || exit 1; - fi - if [ ! -e exp/$recog/decode_mbr_$decode_suff/wer_15 ]; then - mkdir -p exp/$recog/decode_mbr_$decode_suff - cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff - echo " ## MBR RESCORING $dataset | $recog, noadapt ##" - local/score_mbr.sh \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff || exit 1 - fi - if $do_adapt; then - if [ ! -e exp/$recog/fmllr.basis ]; then - if [[ "$recog" =~ _mc ]]; then - tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A - else - tr_dataset=si_tr - fi - steps/get_fmllr_basis.sh --per-utt true data/$tr_dataset data/lang exp/$recog || exit 1; - fi - if [ ! -e exp/$recog/decode_basis_fmllr_$decode_suff/wer_15 ]; then - echo "### DECODING $dataset | $recog, basis_fmllr, $lm ###" - steps/decode_basis_fmllr.sh --nj $nj \ - $graph $dataset \ - exp/$recog/decode_basis_fmllr_$decode_suff || exit 1; - fi - if [ ! -e exp/$recog/decode_mbr_basis_fmllr_$decode_suff/wer_15 ]; then - mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff - cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff - echo " ## MBR RESCORING $dataset | $recog, basis_fmllr ##" - local/score_mbr.sh \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff || exit 1 - fi - fi - - done # loop data set - done # loop recog -done # loop LM + done # loop recog +fi # get all WERs with lmw=15 -local/get_results.sh +if [ $stage -le 12 ]; then + local/get_results.sh +fi diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index 11587e765c7..1014fce03ed 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -1,5 +1,5 @@ -for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done - +#!/bin/bash +for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done exit 0 # Monophone, MFCC+delta+accel @@ -163,7 +163,6 @@ exit 0 %WER 7.73 [ 969 / 12533, 74 ins, 157 del, 738 sub ] exp/nnet5e_mpe_gpu/decode_ug_epoch4/wer_9 - # Some system combination experiments. 
%WER 3.18 [ 398 / 12533, 60 ins, 75 del, 263 sub ] exp/combine_1_2a/decode/wer_4 %WER 1.56 [ 196 / 12533, 27 ins, 32 del, 137 sub ] exp/combine_sgmm2_4a_3b/decode/wer_2 @@ -230,34 +229,61 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ %WER 7.33 [ 919 / 12533, 80 ins, 153 del, 686 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch3/wer_13 %WER 7.36 [ 923 / 12533, 85 ins, 148 del, 690 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch4/wer_13 +### chain results ### +# current best chain result with TDNN (check local/chain/run_tdnn_5f.sh) +%WER 2.94 [ 369 / 12533, 51 ins, 71 del, 247 sub ] exp/chain/tdnn_5f/decode/wer_3_0.5 ### nnet1 results ### -# DNN systems (Karel - 25.9.2014) -# Per-frame cross-entropy training -%WER 1.63 [ 204 / 12533, 32 ins, 42 del, 130 sub ] exp/dnn4b_pretrain-dbn_dnn/decode/wer_3 -%WER 7.77 [ 974 / 12533, 81 ins, 158 del, 735 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_ug/wer_7 -# Sequence-based sMBR training -%WER 1.61 [ 202 / 12533, 32 ins, 42 del, 128 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it1/wer_3 -%WER 1.62 [ 203 / 12533, 33 ins, 42 del, 128 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it2/wer_3 -%WER 1.63 [ 204 / 12533, 32 ins, 42 del, 130 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it3/wer_3 -%WER 1.64 [ 206 / 12533, 32 ins, 42 del, 132 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it4/wer_3 -%WER 1.63 [ 204 / 12533, 32 ins, 41 del, 131 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it5/wer_3 -%WER 1.64 [ 206 / 12533, 20 ins, 58 del, 128 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it6/wer_5 - -# CNN systems (Karel - 25.9.2014) -%WER 1.89 [ 237 / 12533, 30 ins, 47 del, 160 sub ] exp/cnn4c/decode/wer_3 # per-frame training -# 2D-CNN system (from Harish Mallidi, run by Karel - 22.6.2015) -%WER 2.07 [ 260 / 12533, 32 ins, 60 del, 168 sub ] exp/cnn2d4c/decode/wer_4_0.0 # per-frame training - -# Joint training with WSJ data, FBANK+pitch features. 
2 softmax layers, multitask training,
-# (Karel - 10.7.2015)
-%WER 1.52 [ 191 / 12533, 17 ins, 52 del, 122 sub ] exp/dnn4e-fbank_blocksoftmax/decode/wer_4_0.5
+
+# dnn4b, MFCC,LDA,fMLLR features, (Karel - 30.7.2015)
+# Xent,
+%WER 1.75 [ 219 / 12533, 36 ins, 35 del, 148 sub ] exp/dnn4b_pretrain-dbn_dnn/decode/wer_2_0.0
+%WER 7.90 [ 990 / 12533, 90 ins, 147 del, 753 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_ug/wer_5_1.0
+# sMBR,
+%WER 1.77 [ 222 / 12533, 21 ins, 57 del, 144 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it1/wer_4_0.0
+%WER 1.68 [ 210 / 12533, 24 ins, 43 del, 143 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it3/wer_4_0.0
+%WER 1.58 [ 198 / 12533, 20 ins, 41 del, 137 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it6/wer_5_0.0
+
+# cnn4c, FBANK+pitch features, (Karel - 30.7.2015)
+# Xent, no-RBM,
+%WER 2.00 [ 251 / 12533, 34 ins, 54 del, 163 sub ] exp/cnn4c/decode/wer_3_0.5
+# Xent, RBM on top of CNN,
+%WER 2.04 [ 256 / 12533, 20 ins, 78 del, 158 sub ] exp/cnn4c_pretrain-dbn_dnn/decode/wer_6_0.5
+# sMBR,
+%WER 2.02 [ 253 / 12533, 35 ins, 54 del, 164 sub ] exp/cnn4c_pretrain-dbn_dnn_smbr/decode_it1/wer_5_0.0
+%WER 1.93 [ 242 / 12533, 23 ins, 62 del, 157 sub ] exp/cnn4c_pretrain-dbn_dnn_smbr/decode_it3/wer_6_0.5
+%WER 1.90 [ 238 / 12533, 29 ins, 49 del, 160 sub ] exp/cnn4c_pretrain-dbn_dnn_smbr/decode_it6/wer_6_0.0
+
+# dnn4d, FBANK+pitch, (Karel - 30.7.2015)
+# Xent,
+%WER 1.95 [ 245 / 12533, 22 ins, 63 del, 160 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn/decode/wer_4_1.0
+# sMBR,
+%WER 1.98 [ 248 / 12533, 35 ins, 50 del, 163 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_it1/wer_3_0.0
+%WER 1.91 [ 239 / 12533, 19 ins, 60 del, 160 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_it3/wer_5_0.5
+%WER 1.88 [ 236 / 12533, 17 ins, 61 del, 158 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_it6/wer_6_0.5
+
+# dnn4e, FBANK+pitch, 2 output layers: rm + wsj, (Karel - 10.7.2015)
+%WER 1.52 [ 191 / 12533, 17 ins, 52 del, 122 sub ] exp/dnn4e-fbank_blocksoftmax/decode/wer_4_0.5 <<<[BEST]
 %WER 7.86 [ 985 / 12533, 84 ins, 160 del, 741 sub ] exp/dnn4e-fbank_blocksoftmax/decode_ug/wer_8_0.0
-# LSTM result
-for x in exp/lstm4f/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
-%WER 2.04 [ 256 / 12533, 18 ins, 60 del, 178 sub ] exp/lstm4f_c512_r200_c512_r200_lr0.0001_mmt0.9_clip50/decode/wer_4_0.5
-# BLSTM result
+# lstm4f, FBANK+pitch, 2LSTMs, (Karel - 30.7.2015)
+%WER 2.15 [ 270 / 12533, 20 ins, 69 del, 181 sub ] exp/lstm4f/decode/wer_5_0.0
+
+# cnn4g-2D, FBANK+pitch, 2D-CNN system (from Harish Mallidi, run by Karel - 22.6.2015)
+%WER 2.07 [ 260 / 12533, 32 ins, 60 del, 168 sub ] exp/cnn2d4c/decode/wer_4_0.0
+
+# dnn4h, FBANK+pitch, ``dummy ivector'', should be same as 'dnn4d', (Karel - 30.7.2015)
+# Xent, no-RBM,
+%WER 2.14 [ 268 / 12533, 29 ins, 71 del, 168 sub ] exp/dnn4h-dummy-ivec/decode/wer_4_0.0
+# Xent, RBM,
+%WER 1.84 [ 230 / 12533, 29 ins, 51 del, 150 sub ] exp/dnn4h-dummy-ivec_pretrain-dbn_dnn/decode/wer_3_1.0
+# sMBR,
+%WER 1.83 [ 229 / 12533, 29 ins, 50 del, 150 sub ] exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr/decode_it1/wer_3_1.0
+%WER 1.81 [ 227 / 12533, 29 ins, 49 del, 149 sub ] exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr/decode_it3/wer_3_1.0
+%WER 1.86 [ 233 / 12533, 34 ins, 46 del, 153 sub ] exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr/decode_it6/wer_3_0.5
+
+# blstm4i, FBANK+pitch, (Karel - ??.6.2015)
 %WER 2.09 [ 262 / 12533, 25 ins, 69 del, 168 sub ] exp/blstm4g/decode/wer_4_0.0
-### nnet1 results, the end ###
+### ^^^ nnet1 results ^^^ ###
+
diff --git
a/egs/rm/s5/cmd.sh b/egs/rm/s5/cmd.sh index 4478796305e..6e2f3e9ee48 100644 --- a/egs/rm/s5/cmd.sh +++ b/egs/rm/s5/cmd.sh @@ -1,30 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" +export train_cmd=queue.pl +export decode_cmd=queue.pl +export mkgraph_cmd=queue.pl +export cuda_cmd="queue.pl --gpu 1" -# cuda_cmd is used for nnet1 scripts e.g. local/run_dnn.sh, but -# in the nnet2 scripts e.g. local/run_nnet2.sh, this is not -# used and we append options to train_cmd. -cuda_cmd="queue.pl -l arch=*64 -l gpu=1" - -#train_cmd="run.pl" -# with run.pl we do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. +# The rest of this file is here for historical reasons. For cluster-specific +# configuration it's generally better to use conf/queue.conf, see +# http://kaldi-asr.org/doc/queue.html. # BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi diff --git a/egs/rm/s5/conf/decode_dnn.config b/egs/rm/s5/conf/decode_dnn.config index e5f85633c5b..e7cfca74763 100644 --- a/egs/rm/s5/conf/decode_dnn.config +++ b/egs/rm/s5/conf/decode_dnn.config @@ -1,13 +1,8 @@ -# RM setup has weird optimal scaling (ACWT is 1/3) -# -# This is much larger than 1/10 on SWBD, we use pseudo LM, -# so LM scores are likely to be overboosted. -# For the discriminative training we will still use acwt 0.1, -# scaling down the LM scores did not bring significant improvement. -# +# In RM, the optimal decode LMWT is in range 2..5, which is different from usual 10..15 +# (it is caused by using simple rule-based LM, instead of n-gram LM), +scoring_opts="--min-lmwt 2 --max-lmwt 10" +# Still, it is better to use --acwt 0.1, both for decoding and sMBR, acwt=0.1 -# Large acwt, beams need to be larger too: +# For this small task we can afford to have large beams, beam=30.0 # beam for decoding. Was 13.0 in the scripts. 
lattice_beam=18.0 # this has most effect on size of the lattices. -# We search for optimal WER in low LMWTs: -scoring_opts="--min-lmwt 2 --max-lmwt 10" # search acoustic scale in larger values diff --git a/egs/rm/s5/local/chain/run_tdnn_5f.sh b/egs/rm/s5/local/chain/run_tdnn_5f.sh new file mode 100644 index 00000000000..0379d16fe13 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn_5f.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# this script is a modified version of swbd/run_tdnn_5f.sh + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_5f + +# training options +num_epochs=12 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 200 --jesus-forward-output-dim 500 --jesus-hidden-dim 2000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1000000 \ + --lm-opts "--num-extra-lm-states=200" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet2_online/ivectors \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/train $treedir exp/tri3b_lats $dir || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; +fi + +if [ $stage -le 9 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
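+  # A small sanity-check sketch (not part of the original recipe; it assumes the
+  # $dir/tree and $dir/final.mdl produced by the training stages above): the pdf
+  # counts reported by the two standard tools below should agree with each other
+  # and with the graph built here.
+  #   tree-info $dir/tree        # prints num-pdfs, context-width, central-position
+  #   am-info $dir/final.mdl     # prints number of pdfs, transition-ids, etc.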
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph data/test $dir/decode || exit 1; +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph_ug data/test $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/nnet/run_autoencoder.sh b/egs/rm/s5/local/nnet/run_autoencoder.sh index 2ee4b19bf80..c05792a936b 100755 --- a/egs/rm/s5/local/nnet/run_autoencoder.sh +++ b/egs/rm/s5/local/nnet/run_autoencoder.sh @@ -1,8 +1,16 @@ #!/bin/bash +# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +# This example shows how to train a simple autoencoder network. +# We use , little different training hyperparameters and MSE objective. + . path.sh . cmd.sh +set -eu + # Train, dir=exp/autoencoder data_fmllr=data-fmllr-tri3b diff --git a/egs/rm/s5/local/nnet/run_blocksoftmax.sh b/egs/rm/s5/local/nnet/run_blocksoftmax.sh index a1de4d433ca..175a6021778 100755 --- a/egs/rm/s5/local/nnet/run_blocksoftmax.sh +++ b/egs/rm/s5/local/nnet/run_blocksoftmax.sh @@ -28,17 +28,14 @@ wsj_ali=../../wsj/s5/exp/tri4b_ali_si284 stage=0 . utils/parse_options.sh || exit 1; -set -u -set -e -set -o pipefail -set -x +set -euxo pipefail # Make the FBANK features, -if [ $stage -le 0 ]; then +[ ! -e $dev ] && if [ $stage -le 0 ]; then # Make datadir copies, - utils/copy_data_dir.sh $dev_original $dev; rm $dev/{cmvn,feats}.scp 2>/dev/null - utils/copy_data_dir.sh $train_original $train; rm $train/{cmvn,feats}.scp 2>/dev/null - utils/copy_data_dir.sh --utt-prefix wsj_ --spk-prefix wsj_ $wsj_original $wsj; rm $wsj/{cmvn,feats}.scp 2>/dev/null + utils/copy_data_dir.sh $dev_original $dev; rm $dev/{cmvn,feats}.scp + utils/copy_data_dir.sh $train_original $train; rm $train/{cmvn,feats}.scp + utils/copy_data_dir.sh --utt-prefix wsj --spk-prefix wsj $wsj_original $wsj; rm $wsj/{cmvn,feats}.scp # Feature extraction, # Dev set, @@ -46,11 +43,11 @@ if [ $stage -le 0 ]; then $dev $dev/log $dev/data steps/compute_cmvn_stats.sh $dev $dev/log $dev/data # Training set, - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data steps/compute_cmvn_stats.sh $train $train/log $train/data # Wsj, - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $wsj $wsj/log $wsj/data steps/compute_cmvn_stats.sh $wsj $wsj/log $wsj/data diff --git a/egs/rm/s5/local/nnet/run_blstm.sh b/egs/rm/s5/local/nnet/run_blstm.sh index 25dc7dcb455..c9db65f738e 100755 --- a/egs/rm/s5/local/nnet/run_blstm.sh +++ b/egs/rm/s5/local/nnet/run_blstm.sh @@ -12,7 +12,9 @@ # # A more sensible approach should be single-stream training, # and per-utterance updates. But the results were worse. -# + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. . ./cmd.sh . ./path.sh @@ -28,6 +30,8 @@ gmm=exp/tri3b stage=0 . utils/parse_options.sh || exit 1; +set -eu + # Make the FBANK features [ ! 
-e $dev ] && if [ $stage -le 0 ]; then # Dev set @@ -37,7 +41,7 @@ stage=0 steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1; # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; # Split the training set @@ -46,7 +50,7 @@ fi if [ $stage -le 1 ]; then # Train the DNN optimizing per-frame cross-entropy. - dir=exp/blstm4g + dir=exp/blstm4i ali=${gmm}_ali # Train @@ -61,8 +65,6 @@ if [ $stage -le 1 ]; then # Decode (reuse HCLG graph) steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi # TODO : sequence training, diff --git a/egs/rm/s5/local/nnet/run_cnn.sh b/egs/rm/s5/local/nnet/run_cnn.sh index c6a5ee209c2..8c5730a1c85 100755 --- a/egs/rm/s5/local/nnet/run_cnn.sh +++ b/egs/rm/s5/local/nnet/run_cnn.sh @@ -1,5 +1,15 @@ #!/bin/bash +# Copyright 2012-2015 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +# This example shows how to build CNN with convolution along frequency axis. +# First we train CNN, then build RBMs on top, then do train per-frame training +# and sequence-discriminative training. + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -16,9 +26,10 @@ gmm=exp/tri3b stage=0 . utils/parse_options.sh +set -euxo pipefail # Make the FBANK features, -if [ $stage -le 0 ]; then +[ ! 
-e $dev ] && if [ $stage -le 0 ]; then # Dev set utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \ @@ -34,6 +45,7 @@ if [ $stage -le 0 ]; then fi # Run the CNN pre-training, +hid_layers=2 if [ $stage -le 1 ]; then dir=exp/cnn4c ali=${gmm}_ali @@ -43,17 +55,23 @@ if [ $stage -le 1 ]; then --cmvn-opts "--norm-means=true --norm-vars=true" \ --delta-opts "--delta-order=2" --splice 5 \ --network-type cnn1d --cnn-proto-opts "--patch-dim1 8 --pitch-dim 3" \ - --hid-layers 2 --learn-rate 0.008 \ + --hid-layers $hid_layers --learn-rate 0.008 \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; - # Decode - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ + # Decode, + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi -# Pre-train stack of RBMs on top of the convolutional layers (4 layers, 1024 units), if [ $stage -le 2 ]; then + # Concat 'feature_transform' with convolutional layers, + dir=exp/cnn4c + nnet-concat $dir/final.feature_transform \ + "nnet-copy --remove-last-components=$(((hid_layers+1)*2)) $dir/final.nnet - |" \ + $dir/final.feature_transform_cnn +fi + +# Pre-train stack of RBMs on top of the convolutional layers (4 layers, 1024 units), +if [ $stage -le 3 ]; then dir=exp/cnn4c_pretrain-dbn transf_cnn=exp/cnn4c/final.feature_transform_cnn # transform with convolutional layers # Train @@ -65,14 +83,14 @@ if [ $stage -le 2 ]; then fi # Re-align using CNN, -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then dir=exp/cnn4c steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ $train data/lang $dir ${dir}_ali || exit 1 fi # Train the DNN optimizing cross-entropy, -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then dir=exp/cnn4c_pretrain-dbn_dnn; [ ! -d $dir ] && mkdir -p $dir/log; ali=exp/cnn4c_ali feature_transform=exp/cnn4c/final.feature_transform @@ -81,7 +99,7 @@ if [ $stage -le 4 ]; then cnn_dbn=$dir/cnn_dbn.nnet { # Concatenate CNN layers and DBN, num_components=$(nnet-info $feature_transform | grep -m1 num-components | awk '{print $2;}') - cnn="nnet-copy --remove-first-layers=$num_components $feature_transform_dbn - |" + cnn="nnet-copy --remove-first-components=$num_components $feature_transform_dbn - |" nnet-concat "$cnn" $dbn $cnn_dbn 2>$dir/log/concat_cnn_dbn.log || exit 1 } # Train @@ -89,20 +107,20 @@ if [ $stage -le 4 ]; then steps/nnet/train.sh --feature-transform $feature_transform --dbn $cnn_dbn --hid-layers 0 \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; # Decode (reuse HCLG graph) - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi -# Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. For RM good acwt is 0.2, + +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. 
Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. dir=exp/cnn4c_pretrain-dbn_dnn_smbr srcdir=exp/cnn4c_pretrain-dbn_dnn -acwt=0.2 +acwt=0.1 # First we generate lattices and alignments, -if [ $stage -le 4 ]; then +if [ $stage -le 6 ]; then steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ $train data/lang $srcdir ${srcdir}_ali || exit 1; steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ @@ -110,11 +128,11 @@ if [ $stage -le 4 ]; then fi # Re-train the DNN by 6 iterations of sMBR, -if [ $stage -le 5 ]; then +if [ $stage -le 7 ]; then steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 # Decode - for ITER in 1 2 3 4 5 6; do + for ITER in 1 3 6; do steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ --nnet $dir/${ITER}.nnet --acwt $acwt \ $gmm/graph $dev $dir/decode_it${ITER} || exit 1 diff --git a/egs/rm/s5/local/nnet/run_cnn2d.sh b/egs/rm/s5/local/nnet/run_cnn2d.sh index ac69074cf6e..be17bce7a57 100755 --- a/egs/rm/s5/local/nnet/run_cnn2d.sh +++ b/egs/rm/s5/local/nnet/run_cnn2d.sh @@ -1,10 +1,23 @@ #!/bin/bash +# Copyright 2012-2015 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +# This example shows how to build CNN with 2D convolution along both frequency +# and time axis. First we train CNN, then build RBMs on top, then do train +# per-frame training and sequence-discriminative training. + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh ## Source the tools/utils (import the queue.pl) +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + dev=data-fbank/test train=data-fbank/train @@ -16,9 +29,10 @@ gmm=exp/tri3b stage=0 . utils/parse_options.sh +set -eu # Make the FBANK features, -if [ $stage -le 0 ]; then +[ ! 
-e $dev ] && if [ $stage -le 0 ]; then # Dev set utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \ @@ -35,7 +49,7 @@ fi # Run the CNN pre-training, if [ $stage -le 1 ]; then - dir=exp/cnn2d4c + dir=exp/cnn4g-2D ali=${gmm}_ali # Train $cuda_cmd $dir/log/train_nnet.log \ @@ -46,16 +60,14 @@ if [ $stage -le 1 ]; then --hid-layers 2 --learn-rate 0.008 \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; # Decode - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi # Pre-train stack of RBMs on top of the convolutional layers (4 layers, 1024 units), if [ $stage -le 2 ]; then - dir=exp/cnn2d4c_pretrain-dbn - transf_cnn=exp/cnn2d4c/final.feature_transform_cnn # transform with convolutional layers + dir=exp/cnn4g-2D_pretrain-dbn + transf_cnn=exp/cnn4g-2D/final.feature_transform_cnn # transform with convolutional layers # Train $cuda_cmd $dir/log/pretrain_dbn.log \ steps/nnet/pretrain_dbn.sh --nn-depth 4 --hid-dim 1024 --rbm-iter 20 \ @@ -66,22 +78,22 @@ fi # Re-align using CNN, if [ $stage -le 3 ]; then - dir=exp/cnn2d4c + dir=exp/cnn4g-2D steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ $train data/lang $dir ${dir}_ali || exit 1 fi # Train the DNN optimizing cross-entropy, if [ $stage -le 4 ]; then - dir=exp/cnn2d4c_pretrain-dbn_dnn; [ ! -d $dir ] && mkdir -p $dir/log; - ali=exp/cnn2d4c_ali - feature_transform=exp/cnn2d4c/final.feature_transform - feature_transform_dbn=exp/cnn2d4c_pretrain-dbn/final.feature_transform - dbn=exp/cnn2d4c_pretrain-dbn/4.dbn + dir=exp/cnn4g-2D_pretrain-dbn_dnn; [ ! -d $dir ] && mkdir -p $dir/log; + ali=exp/cnn4g-2D_ali + feature_transform=exp/cnn4g-2D/final.feature_transform + feature_transform_dbn=exp/cnn4g-2D_pretrain-dbn/final.feature_transform + dbn=exp/cnn4g-2D_pretrain-dbn/4.dbn cnn_dbn=$dir/cnn_dbn.nnet { # Concatenate CNN layers and DBN, num_components=$(nnet-info $feature_transform | grep -m1 num-components | awk '{print $2;}') - cnn="nnet-copy --remove-first-layers=$num_components $feature_transform_dbn - |" + cnn="nnet-copy --remove-first-components=$num_components $feature_transform_dbn - |" nnet-concat "$cnn" $dbn $cnn_dbn 2>$dir/log/concat_cnn_dbn.log || exit 1 } # Train @@ -89,17 +101,17 @@ if [ $stage -le 4 ]; then steps/nnet/train.sh --feature-transform $feature_transform --dbn $cnn_dbn --hid-layers 0 \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; # Decode (reuse HCLG graph) - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi -# Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. For RM good acwt is 0.2, -dir=exp/cnn2d4c_pretrain-dbn_dnn_smbr -srcdir=exp/cnn2d4c_pretrain-dbn_dnn -acwt=0.2 + +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. 
Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. +dir=exp/cnn4g-2D_pretrain-dbn_dnn_smbr +srcdir=exp/cnn4g-2D_pretrain-dbn_dnn +acwt=0.1 # First we generate lattices and alignments, if [ $stage -le 4 ]; then @@ -114,7 +126,7 @@ if [ $stage -le 5 ]; then steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 # Decode - for ITER in 1 2 3 4 5 6; do + for ITER in 6 3 1; do steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ --nnet $dir/${ITER}.nnet --acwt $acwt \ $gmm/graph $dev $dir/decode_it${ITER} || exit 1 diff --git a/egs/rm/s5/local/nnet/run_dnn.sh b/egs/rm/s5/local/nnet/run_dnn.sh index c30d93a7861..c2ba26970ad 100755 --- a/egs/rm/s5/local/nnet/run_dnn.sh +++ b/egs/rm/s5/local/nnet/run_dnn.sh @@ -15,41 +15,45 @@ # the objective is to emphasize state-sequences with better # frame accuracy w.r.t. reference alignment. +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh ## Source the tools/utils (import the queue.pl) +set -eu + # Config: gmm=exp/tri3b data_fmllr=data-fmllr-tri3b stage=0 # resume training with --stage=N # End of config. -. utils/parse_options.sh || exit 1; +. utils/parse_options.sh # -if [ $stage -le 0 ]; then +[ ! -e $data_fmllr/test ] && if [ $stage -le 0 ]; then # Store fMLLR features, so we can train on them easily, # test dir=$data_fmllr/test steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ --transform-dir $gmm/decode \ - $dir data/test $gmm $dir/log $dir/data || exit 1 + $dir data/test $gmm $dir/log $dir/data # train dir=$data_fmllr/train steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ --transform-dir ${gmm}_ali \ - $dir data/train $gmm $dir/log $dir/data || exit 1 + $dir data/train $gmm $dir/log $dir/data # split the data : 90% train 10% cross-validation (held-out) - utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1 + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 fi if [ $stage -le 1 ]; then # Pre-train DBN, i.e. 
a stack of RBMs (small database, smaller DNN) dir=exp/dnn4b_pretrain-dbn - (tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log $cuda_cmd $dir/log/pretrain_dbn.log \ - steps/nnet/pretrain_dbn.sh --hid-dim 1024 --rbm-iter 20 $data_fmllr/train $dir || exit 1; + steps/nnet/pretrain_dbn.sh --hid-dim 1024 --rbm-iter 20 $data_fmllr/train $dir fi if [ $stage -le 2 ]; then @@ -58,42 +62,42 @@ if [ $stage -le 2 ]; then ali=${gmm}_ali feature_transform=exp/dnn4b_pretrain-dbn/final.feature_transform dbn=exp/dnn4b_pretrain-dbn/6.dbn - (tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log # Train $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ - $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1; + $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir # Decode (reuse HCLG graph) - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph $data_fmllr/test $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $data_fmllr/test $dir/decode_ug || exit 1; + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $data_fmllr/test $dir/decode + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph_ug $data_fmllr/test $dir/decode_ug fi -# Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. For RM good acwt is 0.2 +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. dir=exp/dnn4b_pretrain-dbn_dnn_smbr srcdir=exp/dnn4b_pretrain-dbn_dnn -acwt=0.2 +acwt=0.1 if [ $stage -le 3 ]; then # First we generate lattices and alignments: steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ - $data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1; + $data_fmllr/train data/lang $srcdir ${srcdir}_ali steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1; + $data_fmllr/train data/lang $srcdir ${srcdir}_denlats fi if [ $stage -le 4 ]; then # Re-train the DNN by 6 iterations of sMBR steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ - $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 + $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir # Decode - for ITER in 1 2 3 4 5 6; do + for ITER in 6 3 1; do steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ --nnet $dir/${ITER}.nnet --acwt $acwt \ - $gmm/graph $data_fmllr/test $dir/decode_it${ITER} || exit 1 + $gmm/graph $data_fmllr/test $dir/decode_it${ITER} done fi diff --git a/egs/rm/s5/local/nnet/run_dnn_fbank.sh b/egs/rm/s5/local/nnet/run_dnn_fbank.sh index 1d736c2603b..4671381d3d3 100755 --- a/egs/rm/s5/local/nnet/run_dnn_fbank.sh +++ b/egs/rm/s5/local/nnet/run_dnn_fbank.sh @@ -15,6 +15,9 @@ # the objective is to emphasize state-sequences with better # frame accuracy w.r.t. reference alignment. +# Note: With DNNs in RM, the optimal LMWT is 2-6. 
Don't be tempted to try acwt's like 0.2,
+# the value 0.1 is better both for decoding and sMBR.
+
 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.
@@ -31,8 +34,10 @@ gmm=exp/tri3b
 stage=0
 . utils/parse_options.sh || exit 1;
+set -eu
+
 # Make the FBANK features
-if [ $stage -le 0 ]; then
+[ ! -e $dev ] && if [ $stage -le 0 ]; then
   # Dev set
   utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp
   steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
@@ -40,7 +45,7 @@ if [ $stage -le 0 ]; then
   steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1;
   # Training set
   utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
-  steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \
+  steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \
   $train $train/log $train/data || exit 1;
   steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
   # Split the training set
@@ -70,13 +75,12 @@ if [ $stage -le 2 ]; then
   # Decode (reuse HCLG graph)
   steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
     $gmm/graph $dev $dir/decode || exit 1;
-  steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
-    $gmm/graph_ug $dev $dir/decode_ug || exit 1;
 fi
-# Sequence training using sMBR criterion, we do Stochastic-GD
-# with per-utterance updates. We use usually good acwt 0.1
+# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates.
+# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2,
+# the value 0.1 is better both for decoding and sMBR.
 dir=exp/dnn4d-fbank_pretrain-dbn_dnn_smbr
 srcdir=exp/dnn4d-fbank_pretrain-dbn_dnn
 acwt=0.1
@@ -94,7 +98,7 @@ if [ $stage -le 4 ]; then
   steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \
     $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
   # Decode
-  for ITER in 1 2 3 4 5 6; do
+  for ITER in 6 3 1; do
     steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \
     --nnet $dir/${ITER}.nnet --acwt $acwt \
     $gmm/graph $dev $dir/decode_it${ITER} || exit 1
diff --git a/egs/rm/s5/local/nnet/run_dummy_ivec.sh b/egs/rm/s5/local/nnet/run_dummy_ivec.sh
new file mode 100755
index 00000000000..860f209c2a0
--- /dev/null
+++ b/egs/rm/s5/local/nnet/run_dummy_ivec.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+
+# Copyright 2015 Brno University of Technology (Author: Karel Vesely)
+# Apache 2.0
+
+# This example demonstrates how to add an i-vector to the DNN input (or any other side-info).
+# A fixed vector is pasted to all the frames of an utterance and forwarded to nn-input `as-is',
+# bypassing both the feature transform and global CMVN normalization.
+#
+# The i-vector is simulated by a dummy vector [ 0 0 0 ],
+# note that all the scripts get an extra option '--ivector'
+#
+# First we train the NN without RBM pre-training, then we do the full recipe:
+# RBM pre-training, per-frame training, and sequence-discriminative training.
+
+# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2,
+# the value 0.1 is better both for decoding and sMBR.
+
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+
+. ./path.sh ## Source the tools/utils (import the queue.pl)
+
+dev=data-fbank/test
+train=data-fbank/train
+
+dev_original=data/test
+train_original=data/train
+
+gmm=exp/tri3b
+
+stage=0
+. utils/parse_options.sh
+
+set -uexo pipefail
+
+# Make the FBANK features
+[ ! -e $dev ] && if [ $stage -le 0 ]; then
+  # Dev set
+  utils/copy_data_dir.sh $dev_original $dev; rm $dev/{cmvn,feats}.scp
+  steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
+    $dev $dev/log $dev/data
+  steps/compute_cmvn_stats.sh $dev $dev/log $dev/data
+  # Training set
+  utils/copy_data_dir.sh $train_original $train; rm $train/{cmvn,feats}.scp
+  steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \
+    $train $train/log $train/data
+  steps/compute_cmvn_stats.sh $train $train/log $train/data
+  # Split the training set
+  utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10
+fi
+
+# Create ark with dummy-ivectors,
+[ ! -e data/dummy_ivec.ark ] && cat {$train,$dev}/feats.scp | awk '{ print $1, "[ 0 0 0 ]"; }' >data/dummy_ivec.ark
+ivector=ark:data/dummy_ivec.ark
+
+# 1) Build NN, no pre-training (script test),
+if [ $stage -le 1 ]; then
+  # Train the DNN optimizing per-frame cross-entropy.
+  dir=exp/dnn4h-dummy-ivec
+  ali=${gmm}_ali
+  # Train
+  $cuda_cmd $dir/log/train_nnet.log \
+    steps/nnet/train.sh --hid-layers 4 --hid-dim 1024 --learn-rate 0.008 \
+    --ivector $ivector \
+    --cmvn-opts "--norm-means=true --norm-vars=true" \
+    --delta-opts "--delta-order=2" --splice 5 \
+    ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir
+  # Decode (reuse HCLG graph)
+  steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
+    --ivector $ivector \
+    $gmm/graph $dev $dir/decode
+fi
+
+# 2) Build NN, with pre-training (script test),
+if [ $stage -le 2 ]; then
+  # Pre-train DBN, i.e. a stack of RBMs (small database, smaller DNN)
+  dir=exp/dnn4h-dummy-ivec_pretrain-dbn
+  $cuda_cmd $dir/log/pretrain_dbn.log \
+    steps/nnet/pretrain_dbn.sh \
+    --ivector $ivector \
+    --cmvn-opts "--norm-means=true --norm-vars=true" \
+    --delta-opts "--delta-order=2" --splice 5 \
+    --hid-dim 1024 --rbm-iter 20 $train $dir
+fi
+
+if [ $stage -le 3 ]; then
+  # Train the DNN optimizing per-frame cross-entropy.
+  dir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn
+  ali=${gmm}_ali
+  feature_transform=exp/dnn4h-dummy-ivec_pretrain-dbn/final.feature_transform
+  dbn=exp/dnn4h-dummy-ivec_pretrain-dbn/6.dbn
+  # Train
+  $cuda_cmd $dir/log/train_nnet.log \
+    steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
+    --ivector $ivector \
+    ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir
+  # Decode (reuse HCLG graph)
+  steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
+    --ivector $ivector \
+    $gmm/graph $dev $dir/decode
+fi
+
+
+# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates.
+# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2,
+# the value 0.1 is better both for decoding and sMBR.
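+# A quick way to see where the optimum actually lies (a sketch, reusing the helper
+# already used by the RESULTS file) is to sweep the wer_<lmwt>_<penalty> files of
+# any decode directory from the stages above, e.g.:
+#   grep WER exp/dnn4h-dummy-ivec_pretrain-dbn_dnn/decode/wer_* | utils/best_wer.sh
+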
+dir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr +srcdir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn +acwt=0.1 + +if [ $stage -le 4 ]; then + # First we generate lattices and alignments: + steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ + --ivector $ivector \ + $train data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + --ivector $ivector \ + $train data/lang $srcdir ${srcdir}_denlats +fi + +if [ $stage -le 5 ]; then + # Re-train the DNN by 6 iterations of sMBR + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + --ivector $ivector \ + $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 + # Decode + for ITER in 1 3 6; do + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --ivector $ivector \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $dev $dir/decode_it${ITER} || exit 1 + done +fi + +echo Success +exit 0 + +# Getting results [see RESULTS file] +# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/rm/s5/local/nnet/run_lstm.sh b/egs/rm/s5/local/nnet/run_lstm.sh index f684ea5b036..191ebbf066e 100755 --- a/egs/rm/s5/local/nnet/run_lstm.sh +++ b/egs/rm/s5/local/nnet/run_lstm.sh @@ -6,6 +6,9 @@ # This example script trains a LSTM network on FBANK features. # The LSTM code comes from Yiayu DU, and Wei Li, thanks! +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + . ./cmd.sh . ./path.sh @@ -20,6 +23,8 @@ gmm=exp/tri3b stage=0 . utils/parse_options.sh || exit 1; +set -eu + # Make the FBANK features [ ! -e $dev ] && if [ $stage -le 0 ]; then # Dev set @@ -29,7 +34,7 @@ stage=0 steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1; # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; # Split the training set @@ -45,8 +50,9 @@ if [ $stage -le 1 ]; then $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --network-type lstm --learn-rate 0.0001 \ --cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \ - --train-opts "--momentum 0.9 --halving-factor 0.5" \ - --train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \ + --scheduler-opts "--momentum 0.9 --halving-factor 0.5" \ + --train-tool "nnet-train-lstm-streams" \ + --train-tool-opts "--num-stream=4 --targets-delay=5" \ --proto-opts "--num-cells 512 --num-recurrent 200 --num-layers 2 --clip-gradient 5.0" \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; diff --git a/egs/rm/s5/local/nnet/run_multilingual.sh b/egs/rm/s5/local/nnet/run_multilingual.sh index cecfe09be90..126f616c34d 100755 --- a/egs/rm/s5/local/nnet/run_multilingual.sh +++ b/egs/rm/s5/local/nnet/run_multilingual.sh @@ -58,7 +58,7 @@ if [ $stage -le 0 ]; then tgt_dir=$data/${code}_$(basename $dir) utils/copy_data_dir.sh --utt-suffix _$code --spk-suffix _$code $dir $tgt_dir; rm $tgt_dir/{feats,cmvn}.scp || true # remove features, # extract features, get cmvn stats, - steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd -tc 10" $tgt_dir{,/log,/data} + steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd 
--max-jobs-run 10" $tgt_dir{,/log,/data} steps/compute_cmvn_stats.sh $tgt_dir{,/log,/data} # split lists 90% train / 10% held-out, utils/subset_data_dir_tr_cv.sh $tgt_dir ${tgt_dir}_tr90 ${tgt_dir}_cv10 @@ -89,7 +89,7 @@ objective_function="multitask$(echo ${ali_dim[@]} | tr ' ' '\n' | \ echo "Multitask objective function: $objective_function" # DNN training will be in $dir, the alignments are prepared beforehand, -dir=exp/dnn4g-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-') +dir=exp/dnn4g-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')-${nnet_type} [ ! -e $dir ] && mkdir -p $dir echo "$lang_code_csl" >$dir/lang_code_csl echo "$ali_dir_csl" >$dir/ali_dir_csl @@ -119,9 +119,10 @@ fi if [ $stage -le 2 ]; then case $nnet_type in bn) + # Bottleneck network (40 dimensional bottleneck is good for fMLLR), $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --learn-rate 0.008 \ - --hid-layers 2 --hid-dim 1500 --bn-dim 80 \ + --hid-layers 2 --hid-dim 1500 --bn-dim 40 \ --cmvn-opts "--norm-means=true --norm-vars=false" \ --feat-type "traps" --splice 5 --traps-dct-basis 6 \ --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \ @@ -129,7 +130,38 @@ if [ $stage -le 2 ]; then --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \ ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir ;; + sbn) + # Stacked Bottleneck Netowork, no fMLLR in between, + bn1_dim=80 + bn2_dim=30 + # Train 1st part, + dir_part1=${dir}_part1 + $cuda_cmd ${dir}_part1/log/train_nnet.log \ + steps/nnet/train.sh --learn-rate 0.008 \ + --hid-layers 2 --hid-dim 1500 --bn-dim $bn1_dim \ + --cmvn-opts "--norm-means=true --norm-vars=false" \ + --feat-type "traps" --splice 5 --traps-dct-basis 6 \ + --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \ + --proto-opts "--block-softmax-dims=${ali_dim_csl}" \ + --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \ + ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir_part1 + # Compose feature_transform for 2nd part, + nnet-initialize <(echo " $bn1_dim $((13*bn1_dim)) -10 -5:5 10 ") \ + $dir_part1/splice_for_bottleneck.nnet + nnet-concat $dir_part1/final.feature_transform "nnet-copy --remove-last-components=4 $dir_part1/final.nnet - |" \ + $dir_part1/splice_for_bottleneck.nnet $dir_part1/final.feature_transform.part1 + # Train 2nd part, + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --learn-rate 0.008 \ + --feature-transform $dir_part1/final.feature_transform.part1 \ + --hid-layers 2 --hid-dim 1500 --bn-dim $bn2_dim \ + --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \ + --proto-opts "--block-softmax-dims=${ali_dim_csl}" \ + --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \ + ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir + ;; dnn_small) + # 4 hidden layers, 1024 sigmoid neurons, $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --learn-rate 0.008 \ --cmvn-opts "--norm-means=true --norm-vars=true" \ @@ -140,6 +172,7 @@ if [ $stage -le 2 ]; then ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir ;; dnn) + # 6 hidden layers, 2048 simgoid neurons, $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --learn-rate 0.008 \ --hid-layers 6 --hid-dim 2048 \ diff --git a/egs/rm/s5/local/online/run_nnet2_multisplice_disc.sh b/egs/rm/s5/local/online/run_nnet2_multisplice_disc.sh index 132d2c8f93f..2bddefdac04 100755 --- a/egs/rm/s5/local/online/run_nnet2_multisplice_disc.sh +++ 
b/egs/rm/s5/local/online/run_nnet2_multisplice_disc.sh @@ -65,9 +65,9 @@ fi if [ $stage -le 3 ]; then # I tested the following with --max-temp-archives 3 # to test other branches of the code. - # the -tc 5 limits the I/O. + # the --max-jobs-run 5 limits the I/O. steps/online/nnet2/get_egs_discriminative2.sh \ - --cmd "$decode_cmd -tc 5" \ + --cmd "$decode_cmd --max-jobs-run 5" \ --criterion $criterion --drop-frames $drop_frames \ data/train data/lang ${srcdir}{_ali,_denlats,,_degs} || exit 1; fi diff --git a/egs/rm/s5/local/online/run_nnet2_perturbed.sh b/egs/rm/s5/local/online/run_nnet2_perturbed.sh index eacb071be6e..c018ca2880b 100755 --- a/egs/rm/s5/local/online/run_nnet2_perturbed.sh +++ b/egs/rm/s5/local/online/run_nnet2_perturbed.sh @@ -95,7 +95,7 @@ if [ $stage -le 6 ]; then # dir is the neural-net training dir. utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$dir/egs $dir/egs/storage fi - # the -tc 15 allows more of the dump_egs jobs than the default (5), since we + # the --max-jobs-run 15 allows more of the dump_egs jobs than the default (5), since we # have 4 filesystems to access. We reduce the number of epochs since we have # more data and we don't want so slow down the training too much, and we also # reduce the final learning rate (when we have a lot of data we like a ratio of 10 @@ -110,7 +110,7 @@ if [ $stage -le 6 ]; then --num-threads "$num_threads" \ --minibatch-size "$minibatch_size" \ --parallel-opts "$parallel_opts" \ - --io-opts "-tc 15" \ + --io-opts "--max-jobs-run 15" \ --num-jobs-nnet 4 \ --num-epochs 5 --num-epochs-extra 2 \ --add-layers-period 2 \ diff --git a/egs/rm/s5/local/test_decoders.sh b/egs/rm/s5/local/test_decoders.sh index 53e9d1f884c..2b1d4172139 100755 --- a/egs/rm/s5/local/test_decoders.sh +++ b/egs/rm/s5/local/test_decoders.sh @@ -4,12 +4,12 @@ dir=exp/tri1/decode/tmp mkdir -p $dir acwt=0.083333 -beam=15.0 +beam=15.0 n=100 # number of utts to decode . 
./path.sh -gmm-latgen-faster --max-arcs=-1 --beam=$beam --lattice-beam=6.0 --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=exp/tri1/graph/words.txt exp/tri1/final.mdl exp/tri1/graph/HCLG.fst "ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/test/utt2spk scp:data/test/cmvn.scp 'scp:head -n $n data/test/feats.scp|' ark:- | add-deltas ark:- ark:- |" "ark:|lattice-1best --acoustic-scale=$acwt ark:- ark:- | gzip -c > $dir/lat.1.gz" 2>$dir/decode_latgen_faster.log & +gmm-latgen-faster --beam=$beam --lattice-beam=6.0 --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=exp/tri1/graph/words.txt exp/tri1/final.mdl exp/tri1/graph/HCLG.fst "ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/test/utt2spk scp:data/test/cmvn.scp 'scp:head -n $n data/test/feats.scp|' ark:- | add-deltas ark:- ark:- |" "ark:|lattice-1best --acoustic-scale=$acwt ark:- ark:- | gzip -c > $dir/lat.1.gz" 2>$dir/decode_latgen_faster.log & gmm-decode-faster --beam=$beam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=exp/tri1/graph/words.txt exp/tri1/final.mdl exp/tri1/graph/HCLG.fst "ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/test/utt2spk scp:data/test/cmvn.scp 'scp:head -n $n data/test/feats.scp|' ark:- | add-deltas ark:- ark:- |" ark:/dev/null ark:/dev/null "ark:|gzip -c > $dir/lat.2.gz" 2>$dir/decode_faster.log & @@ -26,4 +26,3 @@ wait echo "$0: decoder comparison test succeeded" exit 0; - diff --git a/egs/rm/s5/path.sh b/egs/rm/s5/path.sh index c3be1ca9d0e..1a6fb5f891b 100755 --- a/egs/rm/s5/path.sh +++ b/egs/rm/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/rm/s5/run.sh b/egs/rm/s5/run.sh index 362e215ecfa..43ec446e6fe 100755 --- a/egs/rm/s5/run.sh +++ b/egs/rm/s5/run.sh @@ -1,6 +1,6 @@ #!/bin/bash -. cmd.sh +. 
./cmd.sh set -e # exit on error @@ -26,8 +26,8 @@ local/rm_prepare_grammar_ug.sh # Unigram grammar (gives worse results, but featdir=mfcc for x in test_mar87 test_oct87 test_feb89 test_oct89 test_feb91 test_sep92 train; do - steps/make_mfcc.sh --nj 8 --cmd "run.pl" data/$x exp/make_feat/$x $featdir - #steps/make_plp.sh --nj 8 --cmd "run.pl" data/$x exp/make_feat/$x $featdir + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" data/$x exp/make_feat/$x $featdir + #steps/make_plp.sh --nj 8 --cmd "$train_cmd" data/$x exp/make_feat/$x $featdir steps/compute_cmvn_stats.sh data/$x exp/make_feat/$x $featdir done @@ -38,7 +38,7 @@ done utils/combine_data.sh data/test data/test_{mar87,oct87,feb89,oct89,feb91,sep92} steps/compute_cmvn_stats.sh data/test exp/make_feat/test $featdir -utils/subset_data_dir.sh data/train 1000 data/train.1k +utils/subset_data_dir.sh data/train 1000 data/train.1k steps/train_mono.sh --nj 4 --cmd "$train_cmd" data/train.1k data/lang exp/mono @@ -50,8 +50,6 @@ steps/train_mono.sh --nj 4 --cmd "$train_cmd" data/train.1k data/lang exp/mono utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph - - steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ exp/mono/graph data/test exp/mono/decode @@ -78,14 +76,15 @@ local/test_decoders.sh # This is a test program that we run only in the steps/align_si.sh --nj 8 --cmd "$train_cmd" \ --use-graphs true data/train data/lang exp/tri1 exp/tri1_ali -# train tri2a [delta+delta-deltas] -steps/train_deltas.sh --cmd "$train_cmd" 1800 9000 \ - data/train data/lang exp/tri1_ali exp/tri2a +# the tri2a experiments are not needed downstream, so commenting them out. +# # train tri2a [delta+delta-deltas] +# steps/train_deltas.sh --cmd "$train_cmd" 1800 9000 \ +# data/train data/lang exp/tri1_ali exp/tri2a -# decode tri2a -utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph -steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode +# # decode tri2a +# utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph +# steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ +# exp/tri2a/graph data/test exp/tri2a/decode # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" \ @@ -151,9 +150,9 @@ steps/align_fmllr.sh --nj 8 --cmd "$train_cmd" --use-graphs true \ # # has bad transcripts, so you can filter it out. Below we demonstrate how to # # run this script. # steps/cleanup/find_bad_utts.sh --nj 20 --cmd "$train_cmd" data/train data/lang \ -# exp/tri3b_ali exp/tri3b_cleanup +# exp/tri3b_ali exp/tri3b_cleanup # # The following command will show you some of the hardest-to-align utterances in the data. -# head exp/tri3b_cleanup/all_info.sorted.txt +# head exp/tri3b_cleanup/all_info.sorted.txt ## MMI on top of tri3b (i.e. LDA+MLLT+SAT+MMI) steps/make_denlats.sh --config conf/decode.config \ @@ -173,7 +172,7 @@ steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ # local/online/run_gmm.sh # local/online/run_nnet2.sh # local/online/run_baseline.sh -# Note: for online decoding with pitch, look at local/run_pitch.sh, +# Note: for online decoding with pitch, look at local/run_pitch.sh, # which calls local/online/run_gmm_pitch.sh # @@ -243,11 +242,14 @@ local/run_sgmm2.sh # # local/run_nnet2.sh -# Karel's neural net recipe. -# local/nnet/run_dnn.sh +# Karel's neural net recipe. +# local/nnet/run_dnn.sh # Karel's CNN recipe. # local/nnet/run_cnn.sh # Karel's 2D-CNN recipe (from Harish). 
# local/nnet/run_cnn2d.sh + +# chain recipe +# local/chain/run_tdnn_5f.sh diff --git a/egs/sprakbanken/README.txt b/egs/sprakbanken/README.txt index 1cf88788ce8..962b7cb7dbe 100644 --- a/egs/sprakbanken/README.txt +++ b/egs/sprakbanken/README.txt @@ -1,10 +1,10 @@ About the sprakbanken corpus: This corpus is a free corpus originally collected by NST for ASR purposes and currently hosted by the Norwegian libraries. The corpus is multilingual and contains Swedish, - Norwegian (Bokmål) and Danish. The current setup works for Danish. The vocabulary is + Norwegian (Bokmål) and Danish. The current setup uses the Danish subcorpus. The vocabulary is large and there is approx. 350 hours of read-aloud speech with associated text scripts. + Some months ago the corpus was republished here: http://www.nb.no/sprakbanken/#ticketsfrom?lang=en - - s1: This is the current recommended recipe. (Danish) + s5: This is the current recommended recipe. (Danish) diff --git a/egs/sprakbanken/s5/cmd.sh b/egs/sprakbanken/s5/cmd.sh index 43867ccf0d9..71dd849a93b 100644 --- a/egs/sprakbanken/s5/cmd.sh +++ b/egs/sprakbanken/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64 --mem 2G" -#export mkgraph_cmd="queue.pl -l arch=*64 --mem 2G" -#export big_memory_cmd="queue.pl -l arch=*64 --mem 2G" -#export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh b/egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh index ce3edc7a9a3..f52a0028074 100755 --- a/egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh +++ b/egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh @@ -13,7 +13,7 @@ dir=exp/nnet5d_gpu . ./cmd.sh . utils/parse_options.sh -( +( if [ ! -z "$temp_dir" ] && [ ! 
-e $dir/egs ]; then mkdir -p $dir @@ -32,19 +32,19 @@ dir=exp/nnet5d_gpu --p 2 \ data/train_si284 data/lang exp/tri4b_ali_si284 $dir || exit 1 - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 10 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \ --transform-dir exp/tri4b/decode_tgpr_dev93 \ exp/tri4b/graph_tgpr data/test_dev93 $dir/decode_tgpr_dev93 - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \ --transform-dir exp/tri4b/decode_tgpr_eval92 \ exp/tri4b/graph_tgpr data/test_eval92 $dir/decode_tgpr_eval92 - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 10 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \ --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ exp/tri4b/graph_bd_tgpr data/test_dev93 $dir/decode_bd_tgpr_dev93 - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \ --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ exp/tri4b/graph_bd_tgpr data/test_eval92 $dir/decode_bd_tgpr_eval92 ) diff --git a/egs/sprakbanken/s5/local/nnet2/sprak_run_5c.sh b/egs/sprakbanken/s5/local/nnet2/sprak_run_5c.sh index 8b1fed26422..4ce59dbf86d 100755 --- a/egs/sprakbanken/s5/local/nnet2/sprak_run_5c.sh +++ b/egs/sprakbanken/s5/local/nnet2/sprak_run_5c.sh @@ -1,7 +1,7 @@ #!/bin/bash # This is neural net training on top of adapted 40-dimensional features. -# +# . ./cmd.sh @@ -16,13 +16,13 @@ test2=$3 --num-hidden-layers 4 --hidden-layer-dim 1024 \ --cmd "$decode_cmd" \ data/train data/lang exp/tri4b_ali exp/nnet5c || exit 1 - - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 7 \ + + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 7 \ --transform-dir exp/tri4b/decode_${uid}_$test1 \ exp/tri4b/graph_$uid data/$test1 exp/nnet5c/decode_${uid}_$test1 if [ -d $test2 ]; then - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 4 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 4 \ --transform-dir exp/tri4b/decode_${uid}_$test2 \ exp/tri4b/graph_${uid} data/$test2 exp/nnet5c/decode_${uid}_$test2 fi diff --git a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh b/egs/sprakbanken/s5/local/sprak_train_cmulm.sh index 5b74fcdfee5..55d6d60bf9d 100755 --- a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh +++ b/egs/sprakbanken/s5/local/sprak_train_cmulm.sh @@ -52,25 +52,10 @@ mkdir -p $test cp -r data/lang/* $test cat $lmdir/sprak.arpa | \ -utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. 
-cat $lmdir/sprak.arpa | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; exit 0; - diff --git a/egs/sprakbanken/s5/local/sprak_train_irstlm.sh b/egs/sprakbanken/s5/local/sprak_train_irstlm.sh index 7abef919e0c..33b27cc3e4c 100755 --- a/egs/sprakbanken/s5/local/sprak_train_irstlm.sh +++ b/egs/sprakbanken/s5/local/sprak_train_irstlm.sh @@ -61,8 +61,8 @@ fi # Checks if espeak is available on the system. espeak is necessary to extend -# the setup because the original transcriptions were created with espeak and -# filtered +# the setup because the original transcriptions were created with espeak and +# filtered if ! which espeak >&/dev/null; then echo "espeak is not available on your system. You must install espeak before proceeding." @@ -95,7 +95,7 @@ if [ ! -f $extdict/lexicon.txt ]; # Filter transcription - # Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove + # Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove # initial and trailing spaces and collapse 2 or more spaces to one space cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt @@ -128,7 +128,7 @@ if [ ! -f $lmdir/extra4.ngt ]; grep -P -v '^[\s?|\.|\!]*$' $newtext | \ awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt - + # Envelop LM training data in context cues add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input @@ -151,22 +151,8 @@ mkdir -p $test cp -r $extlang $test cat $lmdir/extra${N}$lm_suffix | \ -utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. 
-cat $lmdir/extra${N}$lm_suffix | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; diff --git a/egs/sprakbanken/s5/local/sprak_train_rnnlms.sh b/egs/sprakbanken/s5/local/sprak_train_rnnlms.sh index 4d5c0cbb462..16233da5d65 100755 --- a/egs/sprakbanken/s5/local/sprak_train_rnnlms.sh +++ b/egs/sprakbanken/s5/local/sprak_train_rnnlms.sh @@ -28,32 +28,10 @@ devtext=$2 dir=$3 mkdir -p $dir +$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH -( # First make sure the kaldi_lm toolkit is installed. - # Note: this didn't work out of the box for me, I had to - # change the g++ version to just "g++" (no cross-compilation - # needed for me as I ran on a machine that had been setup - # as 64 bit by default. - cd $KALDI_ROOT/tools || exit 1; - if [ -d $rnnlm_ver ]; then - echo Not installing the rnnlm toolkit since it is already there. - else - echo Downloading and installing the rnnlm tools - # http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz - if [ ! -f $rnnlm_ver.tgz ]; then - wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1; - fi - mkdir $rnnlm_ver - cd $rnnlm_ver - tar -xvzf ../$rnnlm_ver.tgz || exit 1; - make CC=g++ || exit 1; - echo Done making the rnnlm tools - fi -) || exit 1; - - if [ ! -f $srcdir/transcripts.uniq ] || [ ! -f $srcdir/lexicon.txt ]; then echo "Expecting $srcdir/transcripts.uniq and $srcdir/lexicon.txt to exist"; exit 1; diff --git a/egs/sprakbanken/s5/local/train_irstlm.sh b/egs/sprakbanken/s5/local/train_irstlm.sh index f0b649dd1c7..c91b68f8aab 100755 --- a/egs/sprakbanken/s5/local/train_irstlm.sh +++ b/egs/sprakbanken/s5/local/train_irstlm.sh @@ -66,22 +66,8 @@ mkdir -p $test cp -r $srcdir/* $test cat $lmdir/train${ngram}.arpa | \ - utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. -cat $lmdir/train${ngram}.arpa | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; diff --git a/egs/sprakbanken/s5/path.sh b/egs/sprakbanken/s5/path.sh index 9df7df54e99..2d17b17a84a 100755 --- a/egs/sprakbanken/s5/path.sh +++ b/egs/sprakbanken/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/sre08/v1/cmd.sh b/egs/sre08/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/sre08/v1/cmd.sh +++ b/egs/sre08/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/sre08/v1/path.sh b/egs/sre08/v1/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/sre08/v1/path.sh +++ b/egs/sre08/v1/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/sre08/v1/sid/compute_vad_decision_gmm.sh b/egs/sre08/v1/sid/compute_vad_decision_gmm.sh new file mode 100755 index 00000000000..b1fee318f34 --- /dev/null +++ b/egs/sre08/v1/sid/compute_vad_decision_gmm.sh @@ -0,0 +1,161 @@ +#!/bin/bash + +# Copyright 2015 David Snyder +# Apache 2.0 +# +# Compute GMM-based VAD output and optionally combine with +# the energy-based VAD decisions. + +nj=10 +cmd=run.pl +map_config= +merge_map_config= +priors= +use_energy_vad=false +num_gselect=20 +norm_vars=false +center=true +stage=-4 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 5 ]; then + echo "Usage: $0 [options] ... "; + echo "e.g.: $0 data/train exp/music_gmm exp/speech_gmm exp/noise_gmm exp/gmm_vad exp/gmm_vad" + echo " Options:" + echo " --map-config # config passed to compute-vad-from-frame-likes" + echo " --priors # list passed to compute-vad-from-frame-likes" + echo " --merge-map-config # config passed to merge-vads" + echo " --use-energy-vad # If true, look for a vad.scp file and combine it with this VAD" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +args=("$@") +gmm_dirs=(${@:2:$(($#-3))}) # The GMM directories +num_gmms=`expr $# - 3` + +data=${args[0]} +log_dir=${args[$num_gmms+1]} +vad_dir=${args[$num_gmms+2]} + +# make $vad_dir an absolute pathname. +vad_dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${vad_dir} ${PWD}` +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $vad_dir || exit 1; +mkdir -p $log_dir || exit 1; + +if $use_energy_vad; then + for f in $data/vad.scp "$merge_map_config"; do + if [ ! -f $f ]; then + echo "compute_vad_decision_gmm.sh: no such file $f" + exit 1; + fi + done +fi + +if [ ! -f $data/feats.scp ]; then + echo "compute_vad_decision_gmm.sh: no such file $f" + exit 1; +fi + +utils/split_data.sh $data $nj || exit 1; +sdata=$data/split$nj; + +# We assume that the same delta-opts is used for each +# GMM dir. +delta_opts=`cat ${gmm_dirs[0]}/delta_opts 2>/dev/null` +if [ -f ${gmm_dirs[0]}/delta_opts ]; then + cp ${gmm_dirs[0]}/delta_opts $dir/ 2>/dev/null +fi + +## Set up features. 
+feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=300 ark:- ark:- |" + +if [ $stage -le -2 ]; then + for gmm_dir in "${gmm_dirs[@]}"; + do + gmm_name=`basename $gmm_dir` + $cmd ${log_dir}/log/${gmm_name}_convert.log \ + fgmm-global-to-gmm ${gmm_dir}/final.ubm ${vad_dir}/${gmm_name}_final.dubm || exit 1; + done +fi + +if [ $stage -le -1 ]; then + echo "$0: doing Gaussian selection" + for gmm_dir in "${gmm_dirs[@]}"; + do + gmm_name=`basename $gmm_dir` + $cmd JOB=1:$nj ${log_dir}/log/${gmm_name}_gselect.JOB.log \ + gmm-gselect --n=$num_gselect ${vad_dir}/${gmm_name}_final.dubm "$feats" ark:- \| \ + fgmm-gselect --gselect=ark,s,cs:- --n=${num_gselect} ${gmm_dir}/final.ubm \ + "$feats" "ark:|gzip -c >${vad_dir}/${gmm_name}_gselect.JOB.gz" || exit 1; + done +fi + +frame_likes="" +if [ $stage -le 0 ]; then + echo "$0: computing frame likelihoods" + for gmm_dir in "${gmm_dirs[@]}"; + do + gmm_name=`basename $gmm_dir` + frame_likes="${frame_likes} ark:${vad_dir}/${gmm_name}_logprob.JOB.ark" + $cmd JOB=1:$nj ${log_dir}/log/get_${gmm_name}_logprob.JOB.log \ + fgmm-global-get-frame-likes --average=false \ + "--gselect=ark,s,cs:gunzip -c ${vad_dir}/${gmm_name}_gselect.JOB.gz|" ${gmm_dir}/final.ubm \ + "$feats" ark:${vad_dir}/${gmm_name}_logprob.JOB.ark || exit 1; + done + + echo "$0: computing VAD decisions from frame likelihoods" + $cmd JOB=1:$nj ${log_dir}/log/make_vad_gmm_${name}.JOB.log \ + compute-vad-from-frame-likes --map=${map_config} --priors=$priors $frame_likes \ + ark,scp:${vad_dir}/vad_gmm_${name}.JOB.ark,${vad_dir}/vad_gmm_${name}.JOB.scp \ + || exit 1; + + if $use_energy_vad ; then + echo "$0: merging with energy-based VAD decisions" + $cmd JOB=1:$nj ${log_dir}/log/merge_vads_${name}.JOB.log \ + merge-vads --map=${merge_map_config} scp:$sdata/JOB/vad.scp \ + scp:${vad_dir}/vad_gmm_${name}.JOB.scp \ + ark,scp:${vad_dir}/vad_merged_${name}.JOB.ark,${vad_dir}/vad_merged_${name}.JOB.scp \ + || exit 1; + fi + + echo "$0: moving old vad.scp to ${data}/vad.scp.bak" + mv ${data}/vad.scp ${data}/vad.scp.bak + + for ((n=1; n<=nj; n++)); do + if $use_energy_vad ; then + cat ${vad_dir}/vad_merged_${name}.$n.scp || exit 1; + else + cat ${vad_dir}/vad_gmm_${name}.$n.scp || exit 1; + fi + done > ${data}/vad.scp +fi + +nc=`cat $data/vad.scp | wc -l` +nu=`cat $data/feats.scp | wc -l` +if [ $nc -ne $nu ]; then + echo "**Warning it seems not all of the speakers got VAD output ($nc != $nu);" + echo "**validate_data_dir.sh will fail; you might want to use fix_data_dir.sh" + [ $nc -eq 0 ] && exit 1; +fi + +echo "$0 created GMM-based VAD output for $name" + +if $cleanup ; then + for gmm_dir in "${gmm_dirs[@]}"; + do + gmm_name=`basename $gmm_dir` + rm ${vad_dir}/${gmm_name}_gselect.*.gz + rm ${vad_dir}/${gmm_name}_logprob.*.ark + done +fi + +exit 0; diff --git a/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh b/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh index 22c5de9b9c3..f6710028ae5 100755 --- a/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh +++ b/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh @@ -13,7 +13,6 @@ cmd="run.pl" stage=-2 delta_window=3 delta_order=2 -num_components=5297 # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -31,7 +30,6 @@ if [ $# != 4 ]; then echo " --nj # number of parallel training jobs" echo " --delta-window # delta window size" echo " --delta-order # delta order" - echo " --number-components # number of components in the final GMM needs" echo " # to be equal to the size of the DNN output layer." exit 1; fi @@ -41,7 +39,9 @@ data_dnn=$2 nnet=$3 dir=$4 -for f in $data/feats.scp $data/vad.scp; do + +for f in $data/feats.scp $data/vad.scp ${data_dnn}/feats.scp \ + ${data_dnn}/vad.scp $nnet; do [ ! -f $f ] && echo "No such file $f" && exit 1; done @@ -64,6 +64,11 @@ feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | \ apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | \ select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +# Parse the output of nnet-am-info to find the size of the output layer +# of the TDNN. This will also correspond to the number of components +# in the ancillary GMM. +num_components=`grep -oP 'output-dim\ \K[0-9]+' <(nnet-am-info $nnet 2> /dev/null)` + $cmd JOB=1:$nj $logdir/make_stats.JOB.log \ nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \ \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \ diff --git a/egs/sre08/v1/sid/music_id.sh b/egs/sre08/v1/sid/music_id.sh new file mode 100755 index 00000000000..4233b5752fd --- /dev/null +++ b/egs/sre08/v1/sid/music_id.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +# Copyright 2015 David Snyder +# Apache 2.0. + +# This script calculates the relative probability of music versus +# speech. + +# Begin configuration section. +nj=10 +cmd="run.pl" +stage=-4 +num_gselect=20 # Gaussian-selection using diagonal and full covariance models +norm_vars=false +center=true +cleanup=true +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/full_ubm_music exp/full_ubm_speech data/test exp/test_results" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --cleanup # If true, clean up temporary files" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + exit 1; +fi + +music_ubmdir=$1 +speech_ubmdir=$2 +data=$3 +dir=$4 + +delta_opts=`cat $speech_ubmdir/delta_opts 2>/dev/null` + +for f in $music_ubmdir/final.ubm $speech_ubmdir/final.ubm $data/feats.scp $data/vad.scp; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log || exit 1; +sdata=$data/split$nj +utils/split_data.sh $data $nj || exit 1; + +## Set up features. 
+feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" + +if [ $stage -le -2 ]; then + $cmd $dir/log/music_convert.log \ + fgmm-global-to-gmm $music_ubmdir/final.ubm $dir/music_final.dubm || exit 1; +fi +if [ $stage -le -2 ]; then + $cmd $dir/log/speech_convert.log \ + fgmm-global-to-gmm $speech_ubmdir/final.ubm $dir/speech_final.dubm || exit 1; +fi + +# Do Gaussian selection using the diagonal forms of the models. + +if [ $stage -le -1 ]; then + echo $nj > $dir/num_jobs + echo "$0: doing Gaussian selection for music UBM" + $cmd JOB=1:$nj $dir/log/music_gselect.JOB.log \ + gmm-gselect --n=$num_gselect $dir/music_final.dubm "$feats" ark:- \| \ + fgmm-gselect --gselect=ark,s,cs:- --n=$num_gselect $music_ubmdir/final.ubm \ + "$feats" "ark:|gzip -c >$dir/music_gselect.JOB.gz" || exit 1; + + echo $nj > $dir/num_jobs + echo "$0: doing Gaussian selection for speech UBM" + $cmd JOB=1:$nj $dir/log/speech_gselect.JOB.log \ + gmm-gselect --n=$num_gselect $dir/speech_final.dubm "$feats" ark:- \| \ + fgmm-gselect --gselect=ark,s,cs:- --n=$num_gselect $speech_ubmdir/final.ubm \ + "$feats" "ark:|gzip -c >$dir/speech_gselect.JOB.gz" || exit 1; +fi + +if ! [ $nj -eq $(cat $dir/num_jobs) ]; then + echo "Number of jobs mismatch" + exit 1; +fi + +# Calculate the average frame-level log-likelihoods for the utterances under +# the music and speech UBMs. +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $dir/log/get_music_logprob.JOB.log \ + fgmm-global-get-frame-likes --average=true \ + "--gselect=ark,s,cs:gunzip -c $dir/music_gselect.JOB.gz|" $music_ubmdir/final.ubm \ + "$feats" ark,t:$dir/music_logprob.JOB || exit 1; +fi +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/get_speech_logprob.JOB.log \ + fgmm-global-get-frame-likes --average=true \ + "--gselect=ark,s,cs:gunzip -c $dir/speech_gselect.JOB.gz|" $speech_ubmdir/final.ubm \ + "$feats" ark,t:$dir/speech_logprob.JOB || exit 1; +fi + +if [ $stage -le 2 ]; then + + for j in $(seq $nj); do cat $dir/music_logprob.$j; done > $dir/music_logprob + for j in $(seq $nj); do cat $dir/speech_logprob.$j; done > $dir/speech_logprob + + n1=$(cat $dir/music_logprob | wc -l) + n2=$(cat $dir/speech_logprob | wc -l) + + if [ $n1 -ne $n2 ]; then + echo "Number of lines mismatch, music versus speech UBM probs: $n1 vs $n2" + exit 1; + fi + + paste $dir/music_logprob $dir/speech_logprob | \ + awk '{if ($1 != $3) { print >/dev/stderr "Sorting mismatch"; exit(1); } print $1, $2, $4;}' \ + >$dir/logprob || exit 1; + + cat $dir/logprob | \ + awk '{lratio = $2-$3; print $1, 1/(1+exp(-lratio));}' \ + >$dir/ratio || exit 1; +fi + +if $cleanup; then + rm $dir/speech_gselect.*.gz + rm $dir/music_gselect.*.gz +fi + +exit 0; diff --git a/egs/sre10/README.txt b/egs/sre10/README.txt index 5f9c0337550..8390136d52b 100644 --- a/egs/sre10/README.txt +++ b/egs/sre10/README.txt @@ -10,8 +10,9 @@ are required by the subdirectories. See the corresponding README.txt files in the subdirectories for more details. - The subdirectories "v1" and so on are different versions of the recipe; - we don't call them "s1" etc., because they don't really correspond to - the speech recognition recipes. - + The subdirectories "v1" and so on are different iVector-based speaker + recognition recipes. The recipe in v1 demonstrates a standard approach + using a full-covariance GMM-UBM, iVectors, and a PLDA backend. 
The example + in v2 replaces the GMM of the v1 recipe with a time-delay deep neural + network. diff --git a/egs/sre10/v1/cmd.sh b/egs/sre10/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100755 --- a/egs/sre10/v1/cmd.sh +++ b/egs/sre10/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh b/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh index aaa45f8e4e1..1d7c4013b83 100755 --- a/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh +++ b/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test cp -r data/lang/* data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/sre10/v1/local/dnn/train_dnn.sh b/egs/sre10/v1/local/dnn/train_dnn.sh index e1ce8ae8e79..d9330e58b69 100755 --- a/egs/sre10/v1/local/dnn/train_dnn.sh +++ b/egs/sre10/v1/local/dnn/train_dnn.sh @@ -15,7 +15,7 @@ set -e local/dnn/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \ /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13 # You could also try specifying the --calldata argument to this command as below. -# If specified, the script will use actual speaker personal identification +# If specified, the script will use actual speaker personal identification # numbers released with the dataset, i.e. real speaker IDs. Note: --calldata has # to be the first argument of this script. # local/fisher_data_prep.sh --calldata /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \ @@ -28,7 +28,7 @@ local/dnn/fisher_prepare_dict.sh utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang -local/dnn/fisher_train_lms.sh +local/dnn/fisher_train_lms.sh local/dnn/fisher_create_test_lang.sh # Use the first 4k sentences as dev set. Note: when we trained the LM, we used @@ -55,12 +55,12 @@ utils/subset_data_dir.sh --first data/dev_and_test_asr 5000 data/dev_asr utils/subset_data_dir.sh --last data/dev_and_test_asr 5000 data/test_asr rm -r data/dev_and_test_asr -steps/compute_cmvn_stats.sh data/dev_asr exp/make_mfcc/dev_asr $mfccdir -steps/compute_cmvn_stats.sh data/test_asr exp/make_mfcc/test_asr $mfccdir +steps/compute_cmvn_stats.sh data/dev_asr exp/make_mfcc/dev_asr $mfccdir +steps/compute_cmvn_stats.sh data/test_asr exp/make_mfcc/test_asr $mfccdir n=$[`cat data/train_all_asr/segments | wc -l` - 10000] utils/subset_data_dir.sh --last data/train_all_asr $n data/train_asr -steps/compute_cmvn_stats.sh data/train_asr exp/make_mfcc/train_asr $mfccdir +steps/compute_cmvn_stats.sh data/train_asr exp/make_mfcc/train_asr $mfccdir # Now-- there are 1.6 million utterances, and we want to start the monophone training @@ -75,30 +75,30 @@ utils/subset_data_dir.sh --speakers data/train_asr 30000 data/train_asr_30k utils/subset_data_dir.sh --speakers data/train_asr 100000 data/train_asr_100k -# The next commands are not necessary for the scripts to run, but increase -# efficiency of data access by putting the mfcc's of the subset +# The next commands are not necessary for the scripts to run, but increase +# efficiency of data access by putting the mfcc's of the subset # in a contiguous place in a file. -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. 
- cp data/train_asr_10k_nodup/feats.scp{,.bak} + cp data/train_asr_10k_nodup/feats.scp{,.bak} copy-feats scp:data/train_asr_10k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_10k_nodup.ark,$mfccdir/kaldi_fish_10k_nodup.scp \ && cp $mfccdir/kaldi_fish_10k_nodup.scp data/train_asr_10k_nodup/feats.scp ) -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. - cp data/train_asr_30k/feats.scp{,.bak} + cp data/train_asr_30k/feats.scp{,.bak} copy-feats scp:data/train_asr_30k/feats.scp ark,scp:$mfccdir/kaldi_fish_30k.ark,$mfccdir/kaldi_fish_30k.scp \ && cp $mfccdir/kaldi_fish_30k.scp data/train_asr_30k/feats.scp ) -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. - cp data/train_asr_100k/feats.scp{,.bak} + cp data/train_asr_100k/feats.scp{,.bak} copy-feats scp:data/train_asr_100k/feats.scp ark,scp:$mfccdir/kaldi_fish_100k.ark,$mfccdir/kaldi_fish_100k.scp \ && cp $mfccdir/kaldi_fish_100k.scp data/train_asr_100k/feats.scp ) steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_asr_10k_nodup data/lang exp/mono0a + data/train_asr_10k_nodup data/lang exp/mono0a steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train_asr_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; @@ -109,7 +109,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \ (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri1/graph data/dev exp/tri1/decode_dev)& + exp/tri1/graph data/dev_asr exp/tri1/decode_dev)& steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train_asr_30k data/lang exp/tri1 exp/tri1_ali || exit 1; @@ -120,7 +120,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; + exp/tri2/graph data/dev_asr exp/tri2/decode_dev || exit 1; )& @@ -134,11 +134,11 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; + exp/tri3a/graph data/dev_asr exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ @@ -150,10 +150,9 @@ steps/train_sat.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev + exp/tri4a/graph data/dev_asr exp/tri4a/decode_dev )& - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ data/train_asr data/lang exp/tri4a exp/tri4a_ali || exit 1; @@ -164,7 +163,7 @@ steps/train_sat.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev + exp/tri5a/graph data/dev_asr exp/tri5a/decode_dev )& # this will help find issues with the lexicon. 
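For reference, the one-command G.fst build that this patch substitutes for the old find_arpa_oovs.pl / remove_oovs.pl / eps2disambig.pl pipeline (in the sprakbanken, sre10, swahili and swbd scripts above and below) looks roughly like the sketch that follows. The directory and LM names are taken from fisher_create_test_lang.sh and are placeholders here; the command assumes words.txt already contains the #0 backoff disambiguation symbol.

# Minimal sketch of the new-style grammar build; paths are illustrative only.
lang_test=data/lang_test                              # a copy of data/lang
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz   # gzipped ARPA LM
gunzip -c "$arpa_lm" | \
  arpa2fst --disambig-symbol=#0 \
    --read-symbol-table=$lang_test/words.txt - $lang_test/G.fst
fstisstochastic $lang_test/G.fst    # the first number printed should be small
utils/validate_lang.pl $lang_test || exit 1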
diff --git a/egs/sre10/v1/path.sh b/egs/sre10/v1/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/sre10/v1/path.sh +++ b/egs/sre10/v1/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/sre10/v2/README.txt b/egs/sre10/v2/README.txt new file mode 100644 index 00000000000..1ba8705e089 --- /dev/null +++ b/egs/sre10/v2/README.txt @@ -0,0 +1,20 @@ + Data required for system development (on top of the data for testing described + in ../README.txt). We use SWBD and the older (prior to 2010) SREs to train the + supervised-GMM and iVector extractor. To create an in-domain system, the SREs + are needed to train the PLDA backend. The TDNN is trained on Fisher English. + + Corpus LDC Catalog No. + SWBD2 Phase 2 LDC99S79 + SWBD2 Phase 3 LDC2002S06 + SWBD Cellular 1 LDC2001S13 + SWBD Ceullar 2 LDC2004S07 + SRE2004 LDC2006S44 + SRE2005 Train LDC2011S01 + SRE2005 Test LDC2011S04 + SRE2006 Train LDC2011S09 + SRE2006 Test 1 LDC2011S10 + SRE2006 Test 2 LDC2012S01 + SRE2008 Train LDC2011S05 + SRE2008 Test LDC2011S08 + Fisher speech LDC2004S13, LDC2005S13 + Fisher test LDC2004T19, LDC2005T19 diff --git a/egs/sre10/v2/path.sh b/egs/sre10/v2/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/sre10/v2/path.sh +++ b/egs/sre10/v2/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/sre10/v2/run.sh b/egs/sre10/v2/run.sh index aaa79a749b9..80d1bcf5944 100755 --- a/egs/sre10/v2/run.sh +++ b/egs/sre10/v2/run.sh @@ -8,7 +8,7 @@ # Results (EERs) are inline in comments below. # # This example script shows how to replace the GMM-UBM -# with a DNN trained for ASR. It also demonstrates the +# with a DNN trained for ASR. It also demonstrates the # using the DNN to create a supervised-GMM. . cmd.sh @@ -21,9 +21,6 @@ trials_male=data/sre10_test_male/trials trials=data/sre10_test/trials nnet=exp/nnet2_online/nnet_ms_a/final.mdl -# Use nnet-am-info to determine the size of the output layer. -num_components=5297 - # Train a DNN on about 1800 hours of the english portion of Fisher. 
local/dnn/train_dnn.sh @@ -66,16 +63,17 @@ steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \ data/sre10_test exp/make_mfcc $mfccdir # Extract DNN features. -steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ - data/train_dnn exp/make_mfcc $mfccdir -steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ - data/sre_dnn exp/make_mfcc $mfccdir -steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ - data/sre10_train_dnn exp/make_mfcc $mfccdir -steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ - data/sre10_test_dnn exp/make_mfcc $mfccdir - -for name in sre_dnn sre10_train_dnn sre10_test_dnn train_dnn sre sre10_train sre10_test train; do +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + --cmd "$train_cmd" data/train_dnn exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + --cmd "$train_cmd" data/sre_dnn exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + --cmd "$train_cmd" data/sre10_train_dnn exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + --cmd "$train_cmd" data/sre10_test_dnn exp/make_mfcc $mfccdir + +for name in sre_dnn sre10_train_dnn sre10_test_dnn train_dnn sre \ + sre10_train sre10_test train; do utils/fix_data_dir.sh data/${name} done @@ -89,7 +87,7 @@ sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ data/sre10_test exp/make_vad $vaddir -for name sre sre10_train sre10_test train; do +for name in sre sre10_train sre10_test train; do cp data/${name}/vad.scp data/${name}_dnn/vad.scp cp data/${name}/utt2spk data/${name}_dnn/utt2spk cp data/${name}/spk2utt data/${name}_dnn/spk2utt @@ -100,25 +98,27 @@ done # Subset training data for faster sup-GMM initialization. utils/subset_data_dir.sh data/train_dnn 32000 data/train_dnn_32k utils/fix_data_dir.sh data/train_dnn_32k -utils/subset_data_dir.sh --utt-list data/train_dnn_32k/utt2spk data/train data/train_32k +utils/subset_data_dir.sh --utt-list data/train_dnn_32k/utt2spk data/train \ + data/train_32k utils/fix_data_dir.sh data/train_32k # Initialize a full GMM from the DNN posteriors and speaker recognition # features. This can be used both alone, as a UBM, or to initialize the # i-vector extractor in a DNN-based system. sid/init_full_ubm_from_dnn.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" \ - --num_components $num_components \ data/train_32k \ data/train_dnn_32k $nnet exp/full_ubm -# Train an i-vector extractor based on just the supervised-GMM. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=70G,ram_free=70G" \ +# Train an i-vector extractor based on just the supervised-GMM. +sid/train_ivector_extractor.sh \ + --cmd "$train_cmd -l mem_free=70G,ram_free=70G" \ --ivector-dim 600 \ --num-iters 5 exp/full_ubm/final.ubm data/train \ exp/extractor_sup_gmm # Train an i-vector extractor based on the DNN-UBM. -sid/train_ivector_extractor_dnn.sh --cmd "$train_cmd -l mem_free=80G,ram_free=80G" \ +sid/train_ivector_extractor_dnn.sh \ + --cmd "$train_cmd -l mem_free=80G,ram_free=80G" \ --min-post 0.015 \ --ivector-dim 600 \ --num-iters 5 exp/full_ubm/final.ubm $nnet \ @@ -127,34 +127,40 @@ sid/train_ivector_extractor_dnn.sh --cmd "$train_cmd -l mem_free=80G,ram_free=80 exp/extractor_dnn # Extract i-vectors from the extractor with the sup-GMM UBM. 
-sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \ +sid/extract_ivectors.sh \ + --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ exp/extractor_sup_gmm data/sre10_train \ exp/ivectors_sre10_train_sup_gmm -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \ +sid/extract_ivectors.sh \ + --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ exp/extractor_sup_gmm data/sre10_test \ exp/ivectors_sre10_test_sup_gmm -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \ +sid/extract_ivectors.sh \ + --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ exp/extractor_sup_gmm data/sre \ exp/ivectors_sre_sup_gmm # Extract i-vectors using the extractor with the DNN-UBM. -sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh \ + --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ exp/extractor_dnn \ $nnet \ data/sre10_test \ data/sre10_test_dnn \ exp/ivectors10_test_dnn -sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh + --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ exp/extractor_dnn \ $nnet \ data/sre10_train \ data/sre10_train_dnn \ exp/ivectors10_train_dnn -sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh + --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ exp/extractor_dnn \ $nnet \ data/sre \ @@ -172,7 +178,7 @@ local/scoring_common.sh data/sre data/sre10_train data/sre10_test \ exp/ivectors_sre10_test_dnn # The commented out scripts show how to do cosine scoring with and without -# first reducing the i-vector dimensionality with LDA. PLDA tends to work +# first reducing the i-vector dimensionality with LDA. PLDA tends to work # best, so we don't focus on the scores obtained here. # # local/cosine_scoring.sh data/sre10_train data/sre10_test \ diff --git a/egs/swahili/s5/local/prepare_lm.sh b/egs/swahili/s5/local/prepare_lm.sh index 3d52417ca19..028aaa421f2 100755 --- a/egs/swahili/s5/local/prepare_lm.sh +++ b/egs/swahili/s5/local/prepare_lm.sh @@ -4,13 +4,5 @@ cd data #convert to FST format for Kaldi -cat local/swahili.arpa | ../utils/find_arpa_oovs.pl lang/words.txt > lang/oovs.txt -cat local/swahili.arpa | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - ../utils/remove_oovs.pl lang/oovs.txt | \ - ../utils/eps2disambig.pl | ../utils/s2eps.pl | fstcompile --isymbols=lang/words.txt \ - --osymbols=lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon > lang/G.fst +arpa2fst --disambig-symbol=#0 --read-symbol-table=lang/words.txt \ + local/swahili.arpa lang/G.fst diff --git a/egs/swahili/s5/path.sh b/egs/swahili/s5/path.sh index 3dc94fa8313..8b61dce675e 100755 --- a/egs/swahili/s5/path.sh +++ b/egs/swahili/s5/path.sh @@ -1,11 +1,16 @@ #!/bin/bash +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +#export PATH=$PWD/utils/:$PWD/steps/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C DATA_DIR=$PWD/data LEXICON=$DATA_DIR/local/dict/lexicon.txt EXP_DIR="dev test" TRAIN_DIR="train" -export KALDI_ROOT=`pwd`/../../.. 
-export PATH=$PWD/utils/:$PWD/steps/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$PWD:$PATH export LC_ALL=C diff --git a/egs/swbd/s5/cmd.sh b/egs/swbd/s5/cmd.sh index 4abf8546b0d..bae7f5cdf45 100644 --- a/egs/swbd/s5/cmd.sh +++ b/egs/swbd/s5/cmd.sh @@ -1,28 +1,16 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5/local/swbd_p1_format_data.sh b/egs/swbd/s5/local/swbd_p1_format_data.sh index f0d38a08dd2..69ad44ccc50 100755 --- a/egs/swbd/s5/local/swbd_p1_format_data.sh +++ b/egs/swbd/s5/local/swbd_p1_format_data.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -20,26 +20,13 @@ done rm -r data/lang_test cp -r data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -68,4 +55,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo swbd_p1_format_data succeeded. - diff --git a/egs/swbd/s5/path.sh b/egs/swbd/s5/path.sh index e1d916917f1..50eedcbb1f4 100755 --- a/egs/swbd/s5/path.sh +++ b/egs/swbd/s5/path.sh @@ -1,6 +1,8 @@ export KALDI_ROOT=`pwd`/../../.. [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh #export KALDI_ROOT=/home/dpovey/kaldi-trunk-test -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C export MKL_NUM_THREADS=16 diff --git a/egs/swbd/s5b/cmd.sh b/egs/swbd/s5b/cmd.sh index 4abf8546b0d..575407ac0ff 100644 --- a/egs/swbd/s5b/cmd.sh +++ b/egs/swbd/s5b/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5b/local/nnet/run_dnn_tandem_uc.sh b/egs/swbd/s5b/local/nnet/run_dnn_tandem_uc.sh index 23c4945a8e7..06ea344be4d 100755 --- a/egs/swbd/s5b/local/nnet/run_dnn_tandem_uc.sh +++ b/egs/swbd/s5b/local/nnet/run_dnn_tandem_uc.sh @@ -89,7 +89,7 @@ if [ $stage -le 4 ]; then dir=exp/nnet5b_uc-part1 feature_transform=$dir/final.feature_transform.part1 nnet-concat $dir/final.feature_transform \ - "nnet-copy --remove-last-layers=4 --binary=false $dir/final.nnet - |" \ + "nnet-copy --remove-last-components=4 --binary=false $dir/final.nnet - |" \ "utils/nnet/gen_splice.py --fea-dim=80 --splice=2 --splice-step=5 |" \ $feature_transform || exit 1 diff --git a/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh index a8eef429fe1..36f72b77083 100755 --- a/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh @@ -20,7 +20,7 @@ EOF ( if [ ! -f exp/nnet5c_gpu/final.mdl ]; then - steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "-l gpu=1" --io-opts "-tc 5" \ + steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "-l gpu=1" --io-opts "--max-jobs-run 5" \ --num-threads 1 --minibatch-size 512 --max-change 40.0 --mix-up 20000 --samples-per-iter 300000 \ --num-epochs 10 --num-epochs-extra 3 --initial-learning-rate 0.0067 --final-learning-rate 0.00067 \ --num-jobs-nnet 10 --num-hidden-layers 5 --hidden-layer-dim 1536 data/train_nodup data/lang \ diff --git a/egs/swbd/s5b/local/online/run_nnet2_ms_disc.sh b/egs/swbd/s5b/local/online/run_nnet2_ms_disc.sh index 1ed461027e1..dc56a8371fb 100755 --- a/egs/swbd/s5b/local/online/run_nnet2_ms_disc.sh +++ b/egs/swbd/s5b/local/online/run_nnet2_ms_disc.sh @@ -108,14 +108,14 @@ if [ $stage -le 3 ]; then if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi steps/nnet2/get_egs_discriminative2.sh \ - --cmd "$decode_cmd -tc $max_jobs" \ + --cmd "$decode_cmd --max-jobs-run $max_jobs" \ --online-ivector-dir exp/nnet2_online/ivectors_train_hires_nodup2 \ --criterion $criterion --drop-frames $drop_frames \ data/train_hires_nodup data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1; # the command below is a more generic, but slower, way to do it. #steps/online/nnet2/get_egs_discriminative2.sh \ - # --cmd "$decode_cmd -tc $max_jobs" \ + # --cmd "$decode_cmd --max-jobs-run $max_jobs" \ # --criterion $criterion --drop-frames $drop_frames \ # data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1; fi diff --git a/egs/swbd/s5b/path.sh b/egs/swbd/s5b/path.sh index db666cc10f6..2d17b17a84a 100755 --- a/egs/swbd/s5b/path.sh +++ b/egs/swbd/s5b/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. 
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH -#$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$KALDI_ROOT/tools/srilm/bin/i686:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index 5302ca6d700..4a95ae7c7a4 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -1,3 +1,10 @@ +#!/bin/bash +# eval2000, +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh; done 2>/dev/null +# swbd subset of eval2000, +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh; done 2>/dev/null +exit 0 + # Note: we report the overall eval2000 performance and the Switchboard portion # of eval2000 (without CallHome) performance separately below. @@ -98,10 +105,105 @@ %WER 14.5 | 1831 21395 | 86.8 8.5 4.6 1.3 14.5 52.4 | exp/nnet2_online/nnet_ms_b_online/decode_eval2000_hires_sw1_tg/score_12/eval2000_hires.ctm.swbd.filt.sys %WER 14.8 | 1831 21395 | 86.7 9.0 4.3 1.6 14.8 52.8 | exp/nnet2_online/nnet_ms_b_online/decode_eval2000_hires_sw1_tg_per_utt/score_10/eval2000_hires.ctm.swbd.filt.sys + +( +# old results with 25 million parameter model. We do not want to use such a big model. 
So see the new results below +# local/nnet3/run_lstm.sh +# these are results with nnet3 LSTMs cell_dim=1280, recurrent_dim=384, lstm_delay=-1 -2 -3, label_delay=5 num_params=25010228 (8 epoch training on speed-perturbed +# and volume perturbed data) +%WER 11.4 | 1831 21395 | 89.8 6.8 3.4 1.2 11.4 46.0 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.7 | 1831 21395 | 88.6 7.6 3.8 1.3 12.7 48.7 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 16.8 | 4459 42989 | 85.1 10.4 4.5 1.9 16.8 52.8 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 18.1 | 4459 42989 | 84.0 11.2 4.8 2.0 18.1 54.9 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 22.0 | 2628 21594 | 80.5 13.9 5.6 2.5 22.0 57.3 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 23.3 | 2628 21594 | 79.4 14.7 6.0 2.7 23.3 59.2 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +) + + +# local/nnet3/run_lstm.sh +# these are results with nnet3 LSTMs cell_dim=1024, recurrent_dim=256, nonrecurrent_projection_dim=256, lstm_delay=-1 -2 -3, label_delay=5 num_params=14.6M (8 epoch training on speed-perturbed +# this setup has the newly introduced feature self-repair, in addition to shrink +%WER 11.6 | 1831 21395 | 89.7 6.9 3.4 1.3 11.6 46.9 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.6 | 1831 21395 | 88.7 7.6 3.7 1.4 12.6 49.6 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 21.3 | 2628 21594 | 81.0 13.2 5.8 2.4 21.3 57.3 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 23.1 | 2628 21594 | 79.5 14.7 5.8 2.6 23.1 59.6 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 16.5 | 4459 42989 | 85.3 10.1 4.6 1.8 16.5 53.0 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 17.9 | 4459 42989 | 84.1 11.2 4.8 2.0 17.9 55.5 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 15.17 [ 7466 / 49204, 993 ins, 1937 del, 4536 sub ] exp/nnet3/lstm_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +%WER 16.12 [ 7931 / 49204, 1072 ins, 1910 del, 4949 sub ] exp/nnet3/lstm_ld5_sp/decode_train_dev_sw1_tg/wer_11_0.0 + + +# bidirectional LSTM +# ----------------------- +# local/nnet3/run_lstm.sh --affix bidirectional \ +# --lstm-delay " [-1,1] [-2,2] [-3,3] " \ +# --label-delay 0 \ +# --cell-dim 1024 \ +# --recurrent-projection-dim 128 \ +# --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 \ +# --chunk-right-context 40 +# (8 epoch training on speed-perturbed and volume perturbed data) +# num_params=20101172 +%WER 10.3 | 1831 21395 | 90.6 6.1 3.2 0.9 10.3 44.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.3 | 1831 21395 | 89.6 6.9 3.5 1.0 11.3 46.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_11_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 14.9 | 4459 42989 | 86.6 9.1 4.3 1.5 14.9 50.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.1 | 4459 42989 | 85.5 10.1 4.5 1.6 16.1 52.7 | 
exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 19.4 | 2628 21594 | 82.7 12.0 5.3 2.1 19.4 54.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 20.8 | 2628 21594 | 81.3 13.1 5.6 2.2 20.8 56.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys + +# results with nnet3 tdnn: local/nnet3/run_tdnn.sh (11.10.2015) (2 epoch training on speed-perturbed and volume perturbed data) +%WER 12.1 | 1831 21395 | 89.1 7.1 3.8 1.3 12.1 48.1 | exp/nnet3/tdnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 13.6 | 1831 21395 | 87.9 8.2 3.9 1.5 13.6 51.0 | exp/nnet3/tdnn_sp/decode_eval2000_hires_sw1_tg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys + +# results with nnet3 cnn+tdnn: local/nnet3/run_tdnn.sh --use_cnn true (1.2.2016) (2 epoch training on speed-perturbed and volume perturbed data) +%WER 12.0 | 1831 21395 | 89.3 7.1 3.7 1.3 12.0 47.1 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 22.4 | 2628 21594 | 80.1 13.7 6.2 2.6 22.4 57.6 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_11_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 17.3 | 4459 42989 | 84.7 10.5 4.9 2.0 17.3 53.5 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 13.5 | 1831 21395 | 88.0 8.1 3.9 1.5 13.5 49.4 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_tg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 24.3 | 2628 21594 | 78.6 15.0 6.4 2.9 24.3 60.0 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 18.9 | 4459 42989 | 83.2 11.5 5.3 2.2 18.9 55.6 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# current best 'chain' models with TDNNs (see local/chain/run_tdnn_2o.sh) +%WER 11.3 | 1831 21395 | 90.0 6.8 3.2 1.3 11.3 46.6 | exp/chain/tdnn_2o_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 13.0 | 1831 21395 | 88.6 7.9 3.6 1.6 13.0 50.4 | exp/chain/tdnn_2o_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys + +# current best 'chain' models with LSTM (see local/chain/run_lstm_d.sh) +%WER 10.5 | 1831 21395 | 90.8 6.4 2.9 1.3 10.5 44.3 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 15.9 | 4459 42989 | 86.0 9.6 4.3 2.0 15.9 51.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 21.2 | 2628 21594 | 81.4 12.8 5.9 2.6 21.2 56.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 13.88 [ 6829 / 49204, 935 ins, 1690 del, 4204 sub ] exp/chain/lstm_d_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 + +# these are results with nnet3 LSTMs with CTC training : local/ctc/run_lstm.sh +%WER 17.4 | 1831 21395 | 85.3 10.1 4.6 2.7 17.4 57.8 | exp/ctc/lstm_sp/decode_eval2000_sw1_fsh_fg_0.15/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 19.4 | 1831 21395 | 83.5 11.2 5.2 3.0 19.4 60.7 | exp/ctc/lstm_sp/decode_eval2000_sw1_tg_0.15/score_12_0.5/eval2000_hires.ctm.swbd.filt.sys + + # Resegmentation numbers for swbd subset. 
%WER 22.4 | 1831 21395 | 79.9 13.4 6.6 2.4 22.4 61.4 | exp/tri4a_reseg/decode_eval2000_sw1_tg/score_15/eval2000.ctm.swbd.filt.sys %WER 30.3 | 1831 21395 | 73.1 19.1 7.9 3.4 30.3 67.5 | exp/tri4a_reseg/decode_eval2000_sw1_tg.si/score_13/eval2000.ctm.swbd.filt.sys -# Raw fmllr numbers for swbd subset. +# Raw fmllr numbers for swbd subset. %WER 22.1 | 1831 21395 | 80.1 14.1 5.8 2.2 22.1 59.8 | exp/tri4b/decode_eval2000_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys %WER 30.1 | 1831 21395 | 72.7 19.5 7.9 2.8 30.1 65.4 | exp/tri4b/decode_eval2000_sw1_tg.si/score_14/eval2000.ctm.swbd.filt.sys + + +### Karel's nnet1 +# nnet1 DNN recipe (29.09.2015), swbd subset, +# cross-entropy (3gram decoding, fisher 4gram rescoring), +%WER 14.6 | 1831 21395 | 87.0 8.9 4.2 1.6 14.6 52.3 | exp/dnn5b_pretrain-dbn_dnn/decode_eval2000_sw1_tg/score_12_0.5/eval2000.ctm.swbd.filt.sys +%WER 13.0 | 1831 21395 | 88.5 7.8 3.7 1.4 13.0 49.5 | exp/dnn5b_pretrain-dbn_dnn/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000.ctm.swbd.filt.sys +# sMBR (3gram decoding, fisher 4gram rescoring), +%WER 13.2 | 1831 21395 | 88.5 8.1 3.4 1.7 13.2 48.7 | exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2000_sw1_tg_it4/score_14_0.0/eval2000.ctm.swbd.filt.sys +%WER 11.7 | 1831 21395 | 89.9 7.1 3.0 1.6 11.7 45.8 | exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2000_sw1_fsh_fg_it4/score_13_0.0/eval2000.ctm.swbd.filt.sys + +# nnet1 Tandem recipe local/nnet/run_dnn_tandem_uc.sh (29.09.2015), swbd subset, +# Stacked bottleneck network, +%WER 15.3 | 1831 21395 | 86.2 9.5 4.3 1.5 15.3 52.6 | exp/nnet5uc-part2/decode_eval2000_sw1_tg/score_13_0.0/eval2000.ctm.swbd.filt.sys +# GMMs on BN-features, +%WER 16.7 | 1831 21395 | 85.0 10.5 4.5 1.8 16.7 54.3 | exp/tri6uc/decode_eval2000_graph_sw1_tg/score_20_0.0/eval2000.ctm.swbd.filt.sys +%WER 15.8 | 1831 21395 | 85.8 9.9 4.4 1.6 15.8 53.2 | exp/tri7uc-sat/decode_eval2000_graph_sw1_tg/score_20_0.0/eval2000.ctm.swbd.filt.sys +%WER 14.6 | 1831 21395 | 87.1 9.2 3.6 1.8 14.6 51.8 | exp/tri7uc-sat_mmi_b0.1/decode_eval2000_graph_sw1_tg_it4/score_17_0.0/eval2000.ctm.swbd.filt.sys +# fisher 4gram rescoring, +%WER 13.2 | 1831 21395 | 88.3 8.2 3.4 1.5 13.2 49.2 | exp/tri7uc-sat_mmi_b0.1/decode_eval2000_graph_sw1_fsh_fg_it4/score_19_0.0/eval2000.ctm.swbd.filt.sys diff --git a/egs/swbd/s5c/callhm.perf b/egs/swbd/s5c/callhm.perf new file mode 100644 index 00000000000..a31a83b32bb --- /dev/null +++ b/egs/swbd/s5c/callhm.perf @@ -0,0 +1,33 @@ +%WER 25.6 | 2628 21594 | 77.8 16.0 6.2 3.4 25.6 63.0 | exp/chain/tdnn_v_sp/decode_eval2000_sw1_fsh_fg/score_11_0.5/eval2000_hires.ctm.callhm.filt.sys +%WER 22.7 | 2628 21594 | 79.9 13.6 6.5 2.6 22.7 59.0 | exp/chain/tdnn_v1_trial4_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.6 | 2628 21594 | 80.0 13.5 6.5 2.6 22.6 58.5 | exp/chain/tdnn_v1_trial5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.6 | 2628 21594 | 80.0 13.2 6.8 2.6 22.6 59.2 | exp/chain/tdnn_v1_trial3_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.5 | 2628 21594 | 80.1 13.5 6.4 2.6 22.5 58.6 | exp/chain/tdnn_v1_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.5 | 2628 21594 | 79.6 12.5 7.9 2.2 22.5 59.1 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.3 13.1 6.6 2.6 22.3 59.2 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 
22.3 | 2628 21594 | 80.2 13.2 6.6 2.5 22.3 57.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.2 13.2 6.6 2.5 22.3 57.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.2 13.2 6.6 2.5 22.3 57.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.2 13.2 6.6 2.5 22.3 57.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.1 13.3 6.6 2.4 22.3 58.4 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys +%WER 22.2 | 2628 21594 | 80.2 13.0 6.8 2.4 22.2 58.4 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.1 | 2628 21594 | 80.4 13.3 6.3 2.6 22.1 58.5 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_400_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.1 | 2628 21594 | 80.4 13.3 6.3 2.5 22.1 58.6 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.1 | 2628 21594 | 80.4 13.3 6.3 2.5 22.1 58.6 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.1 | 2628 21594 | 80.0 12.3 7.7 2.2 22.1 58.0 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.6 13.1 6.4 2.5 22.0 58.7 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.6 13.1 6.3 2.6 22.0 58.5 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.3 12.5 7.2 2.4 22.0 58.1 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.3 12.0 7.7 2.2 22.0 57.7 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.3 12.0 7.7 2.2 22.0 57.7 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.9 | 2628 21594 | 80.7 13.2 6.1 2.6 21.9 58.1 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_300_sw1_fsh_fg/score_8_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.9 | 2628 21594 | 80.6 13.3 6.1 2.5 21.9 58.2 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.9 | 2628 21594 | 80.6 12.9 6.6 2.5 21.9 58.2 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_400_sw1_fsh_fg/score_9_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.9 | 2628 21594 | 80.4 12.3 7.3 2.3 21.9 59.1 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.8 | 2628 21594 | 80.6 12.5 6.9 2.4 21.8 58.2 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.8 | 2628 21594 | 80.3 12.6 7.1 2.1 21.8 58.4 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys +%WER 21.7 | 2628 21594 | 80.9 13.0 6.1 2.6 21.7 58.2 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_8_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.7 | 
2628 21594 | 80.7 12.9 6.4 2.5 21.7 58.8 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_400_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.7 | 2628 21594 | 80.6 12.9 6.5 2.4 21.7 58.2 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_400_sw1_fsh_fg/score_9_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.6 | 2628 21594 | 80.6 11.9 7.5 2.2 21.6 57.8 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.5 | 2628 21594 | 80.7 12.8 6.5 2.3 21.5 58.3 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.5/eval2000_hires.ctm.callhm.filt.sys diff --git a/egs/swbd/s5c/cmd.sh b/egs/swbd/s5c/cmd.sh index 036d89a9ea5..d500a690621 100644 --- a/egs/swbd/s5c/cmd.sh +++ b/egs/swbd/s5c/cmd.sh @@ -1,28 +1,29 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +export cuda_cmd="queue.pl --gpu 1" -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# the rest of this file is present for historical reasons. it's better to +# create and edit conf/queue.conf for cluster-specific configuration. +if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then + # BUT cluster: + queue="all.q@@blade,all.q@@speech" + storage="matylda5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.25" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" + export cuda_cmd="queue.pl -q long.q -l gpu=1" +fi diff --git a/egs/swbd/s5c/conf/decode_online.config b/egs/swbd/s5c/conf/decode_online.config new file mode 100644 index 00000000000..410ca63c28b --- /dev/null +++ b/egs/swbd/s5c/conf/decode_online.config @@ -0,0 +1,2 @@ +beam=11.0 # beam for decoding. 
+first_beam=8.0 # beam for 1st-pass decoding in SAT. diff --git a/egs/swbd/s5c/conf/mfcc_dbl3.conf b/egs/swbd/s5c/conf/mfcc_dbl3.conf new file mode 100644 index 00000000000..f0e09186f3e --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_dbl3.conf @@ -0,0 +1,16 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=10 # for the higher-frequency-resolution mfcc coefficients, we'll use + # a larger window size of 25ms and the normal window. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800) +--frame-length=17 # shorter than normal (25ms) frame length.... the shortest we can + # go without the FFT becoming lower resolution which might cause + # problems +--window-type=hanning # additionally making the context shorter by using a more aggressively tapering window. +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hires_dbl.conf b/egs/swbd/s5c/conf/mfcc_hires_dbl.conf new file mode 100644 index 00000000000..c41b76116ee --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hires_dbl.conf @@ -0,0 +1,12 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800) +--frame-length=20 # slightly less than the normal 25ms frame length. +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf b/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf new file mode 100644 index 00000000000..92670e7ed6e --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800) +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hiresf.conf b/egs/swbd/s5c/conf/mfcc_hiresf.conf new file mode 100644 index 00000000000..c0b1798a9c5 --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hiresf.conf @@ -0,0 +1,12 @@ +# this is a config for 'fast' (7.5ms frame shift) high-resolution MFCC features, +# intended for use with chain models. Note: we keep all cepstra, so it has the +# same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800) +--frame-length=25 # the normal frame length +--frame-shift=7.5 diff --git a/egs/swbd/s5c/local/chain/README.txt b/egs/swbd/s5c/local/chain/README.txt new file mode 100644 index 00000000000..8e347f4f889 --- /dev/null +++ b/egs/swbd/s5c/local/chain/README.txt @@ -0,0 +1,29 @@ + +there are a lot of tuning experiments here. + +ones to look at right now: + 2y is a TDNN baseline + 4f is a good jesus-layer system + 4q is an improved TDNN with various bells and whistles from Vijay. + 4r is a slightly-better jesus-layer system than 4f, with one more layer. + 5e is the best configuration run so far that doesn't have statistics-averaging layers. + 5g uses a statistics-averaging layer in the middle to slightly improve on 5e (by about + 0.2%). + 5j is a basic configuration without iVectors (about 2% abs worse than 5e) + 5k is the best configuration without iVectors... about 1% abs worse than 5e; we + use statistics-averaging layers to do some crude adaptation. + 5t gives about the same performance as 5e but is about 30% faster to train + and is smaller. + 5v is what I am currently using as a baseline- it has an even smaller + --jesus-hidden-dim than 5t (hence faster to train), but gives the same + performance. + 6g is a setup with a 'thinner' jesus-layer (with only one repeated-affine component) + and slightly more parameters, which is quicker to train than 5v but gives + about the same results. I'm hoping to use this setup, going forward. + 6i is like 6g but with a separate last-but-one affine layer for the xent output + (marginally better than 6g). + 6z is probably the thing I currently recommend to run-- it's a TDNN+ReLU based + setup that's quite fast to train and gives better results than our old + jesus-layer-based system.
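# Editorial aside (not part of the patch): the mfcc_*.conf files added above are
# consumed through the standard feature-extraction wrapper. A hedged usage sketch,
# assuming cmd.sh has been sourced and that a data directory data/train_fast exists
# (the directory and output names here are hypothetical):
steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" \
  --mfcc-config conf/mfcc_hiresf.conf \
  data/train_fast exp/make_mfcc/train_fast mfcc_fast
steps/compute_cmvn_stats.sh data/train_fast exp/make_mfcc/train_fast mfcc_fast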
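# Editorial aside (not part of the patch): the compare_wer.sh script added in the next
# hunk summarizes several of the experiments listed in this README side by side. One
# possible invocation, run from egs/swbd/s5c and assuming the corresponding
# exp/chain/tdnn_*_sp directories exist:
local/chain/compare_wer.sh 5e 5t 5v 6g 6z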
+ + diff --git a/egs/swbd/s5c/local/chain/compare_wer.sh b/egs/swbd/s5c/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ded03563711 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer.sh @@ -0,0 +1,62 @@ +#!/bin/bash + + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +echo -n "WER on train_dev(tg) " +for x in $*; do + wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on train_dev(fg) " +for x in $*; do + wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(tg) " +for x in $*; do + wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(fg) " +for x in $*; do + wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/swbd/s5c/local/chain/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/run_blstm_6h.sh new file mode 100755 index 00000000000..b19a0b489a0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm_6h.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +#%WER 9.6 | 1831 21395 | 91.6 5.8 2.6 1.2 9.6 44.2 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 14.5 | 4459 42989 | 87.4 8.9 3.7 1.9 14.5 50.5 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.3 | 2628 21594 | 83.3 11.8 4.9 2.5 19.3 54.8 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 13.32 [ 6554 / 49204, 830 ins, 1696 del, 4028 sub ] exp/chain/blstm_6h_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 + +label_delay=0 +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" [-3,3] [-3,3] [-3,3] " \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_blstm_6h_discriminative.sh b/egs/swbd/s5c/local/chain/run_blstm_6h_discriminative.sh new file mode 100755 index 00000000000..b0264c17d8b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm_6h_discriminative.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +set -o pipefail +set -e +# this is run_discriminative.sh + +# This script does discriminative training on top of chain nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. +# +. cmd.sh + + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=-10 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). + +# Frame chunk options that will be used for blstm models. +frames_per_chunk=150 +extra_left_context=40 +extra_right_context=40 +extra_left_context_initial=-1 +extra_right_context_final=-1 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +srcdir=exp/chain/blstm_6h_sp +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +degs_dir= # If provided, will skip the degs directory creation +lats_dir= # If provided, will skip denlats creation + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion} + +## Egs options +frames_per_eg=150 +frames_overlap_per_eg=30 +truncate_deriv_weights=10 + +## Nnet training options +effective_learning_rate=0.000000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=4 +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +minibatch_size=64 + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! 
cuda-compiled; then + cat </dev/null || true + + data_dirs= + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ + $x $train_data_dir exp/shift_hires/ mfcc_hires + utils/fix_data_dir.sh ${train_data_dir}_fs$x + data_dirs="$data_dirs ${train_data_dir}_fs$x" + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp + done + utils/combine_data.sh ${train_data_dir}_fs $data_dirs + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + rm -r ${train_data_dir}_fs$x + done + fi + + train_data_dir=${train_data_dir}_fs + + affix=_fs +fi + +rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp +done +online_ivector_dir=${online_ivector_dir}_fs + +if [ $stage -le 1 ]; then + # hardcode no-GPU for alignment, although you could use GPU [you wouldn't + # get excellent GPU utilization though.] + nj=350 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ + --online-ivector-dir $online_ivector_dir $context_opts \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; +fi + +if [ -z "$lats_dir" ]; then + lats_dir=${srcdir}_denlats${affix} + if [ $stage -le 2 ]; then + nj=50 + # this doesn't really affect anything strongly, except the num-jobs for one of + # the phases of get_egs_discriminative.sh below. + num_threads_denlats=6 + subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving + # total slots = 80 * 6 = 480. + steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ + --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --online-ivector-dir $online_ivector_dir $context_opts \ + --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ + $train_data_dir $lang $srcdir ${lats_dir} ; + fi +fi + +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] + +valid_left_context=$[valid_left_context + frames_per_eg] +valid_right_context=$[valid_right_context + frames_per_eg] + +cmvn_opts=`cat $srcdir/cmvn_opts` + +if [ -z "$degs_dir" ]; then + degs_dir=${srcdir}_degs${affix} + + if [ $stage -le 3 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d ${srcdir}_degs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage + fi + # have a higher maximum num-jobs if + if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi + + degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" + + steps/nnet3/get_egs_discriminative.sh \ + --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ + --adjust-priors false --acwt 1.0 \ + --online-ivector-dir $online_ivector_dir \ + --left-context $left_context --right-context $right_context \ + --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ + --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; + fi +fi + +if [ $stage -le 4 ]; then + steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ + --stage $train_stage \ + --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ + --criterion $criterion --drop-frames true --acoustic-scale 1.0 \ + --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ + --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ + --regularization-opts "$regularization_opts" --use-frame-shift false \ + --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --modify-learning-rates false \ + ${degs_dir} $dir ; +fi + +graph_dir=$srcdir/graph_sw1_tg +if [ $stage -le 5 ]; then + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in train_dev eval2000 rt03; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x.adj + + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} $context_opts \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg_$iter ; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_$iter ; + fi + ) & + done + done +fi +wait; + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. + rm ${lats_dir}/lat.*.gz || true + rm ${srcdir}_ali/ali.*.gz || true + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + + +exit 0; + diff --git a/egs/swbd/s5c/local/chain/run_blstm_d.sh b/egs/swbd/s5c/local/chain/run_blstm_d.sh new file mode 100755 index 00000000000..74cea0c28ab --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm_d.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# based on run_tdnn_2o.sh + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_d # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= +decode_dir_affix= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" [-3,3] [-3,3] [-3,3] " +label_delay=0 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=128 +non_recurrent_projection_dim=128 + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 + +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + # note an additional space is added to splice_indexes to + # avoid issues with the python ArgParser which can have + # issues with negative arguments (due to minus sign) + config_extra_opts=() + [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --splice-indexes "$splice_indexes " \ + --num-lstm-layers $num_lstm_layers \ + --cell-dim $cell_dim \ + --hidden-dim $hidden_dim \ + --recurrent-projection-dim $recurrent_projection_dim \ + --non-recurrent-projection-dim $non_recurrent_projection_dim \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.xent-regularize $xent_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.9 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/lstm/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/run_lstm_6h.sh new file mode 100755 index 00000000000..0e777d85fac --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_lstm_6h.sh @@ -0,0 +1,211 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +# %WER 15.6 | 4459 42989 | 86.1 9.2 4.7 1.8 15.6 52.1 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# %WER 10.3 | 1831 21395 | 90.9 6.1 3.0 1.3 10.3 44.7 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 20.7 | 2628 21594 | 82.0 12.8 5.3 2.7 20.7 56.7 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys + +# if right-tolerance was 10 (these are old results) +#--------------------------- +# %WER 15.8 | 4459 42989 | 86.0 9.3 4.8 1.8 15.8 52.0 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 10.6 | 1831 21395 | 90.6 6.2 3.2 1.2 10.6 45.2 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 21.0 | 2628 21594 | 81.4 12.4 6.3 2.4 21.0 56.8 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6h2 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 + +label_delay=5 +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" -3 -3 -3 " \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_lstm_d.sh b/egs/swbd/s5c/local/chain/run_lstm_d.sh new file mode 100755 index 00000000000..05db63c2bee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_lstm_d.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# based on run_tdnn_2o.sh + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_d # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -3 -3 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 + +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + # note an additional space is added to splice_indexes to + # avoid issues with the python ArgParser which can have + # issues with negative arguments (due to minus sign) + config_extra_opts=() + [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --splice-indexes "$splice_indexes " \ + --num-lstm-layers $num_lstm_layers \ + --cell-dim $cell_dim \ + --hidden-dim $hidden_dim \ + --recurrent-projection-dim $recurrent_projection_dim \ + --non-recurrent-projection-dim $non_recurrent_projection_dim \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00001 \ + --chain.xent-regularize $xent_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/lstm/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2a.sh b/egs/swbd/s5c/local/chain/run_tdnn_2a.sh new file mode 100755 index 00000000000..98d9130989a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2a.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". +# Note, this leads to a cutoff of zero, so it's the same as infinite --num-extra-states. +# The table below compares a sequence of experiments {x,s,w,z,2a} where only +# the --num-extra-states is varied. + +# I'm also adding to this table some other experiments: 2d, which also had +# --num-extra-states=2000 --ngram-order=4 --leftmost-context-questions=/dev/null [so +# there was no concept of sets of phones for the 3-gram, plus we could go to 4-gram]. 
+# [note that the actual baseline for 2d was 2c, which was as 2a but with +# a code change RE transition-scale, but that made no consistent difference, so +# acting as if that was a no-op.] +# +# +# Comparing the --num-extra-states: +# +# --num-extra-states: 0 200 500 2000 8000 *these all had the default --leftmost-context-questions, splitting to ~23 sets.] +# --num-extra-states: 2000 *plus: --ngram-order=4 --leftmost-context-questions=/dev/null [so 3gram and 4gram all in one set, and 4gram allowed.] +# new code, --num-lm-states,--ngram-order: 10k,5 7k,5 5k,4 (this pruned on state count and only left bigrams unpruned) +# newer code, --num-extra-lm-states (note, ngram-order=5,no-prune-order=3) 2000 1000 (prune on perplexity, no-prune default=3gram). +# experiment: x s w z 2a | 2d | 2f 2g 2h | 2i 2j +# WER (train_dev,tg) 18.67 18.45 *18.02 18.06 18.20 |*17.55 | 17.49*17.28 17.46 |*17.44 17.54 +# WER (train_dev,fg) 17.22 16.96 16.70 *16.46 16.59 |*16.14 | 16.21 16.14 *16.08 |*16.09 16.20 +# WER (eval2000,tg) 20.4 20.1 19.9 *19.7 19.8 |*19.5 | 19.6 *19.4 19.5 |*19.2 *19.2 +# WER (eval2000,fg) 18.4 18.0 17.9 18.0 *17.7 |*17.6 | 17.8 17.7 *17.6 | 17.3 *17.2 +# #states in den.fst 29384 30064 30744 31487 31729 | 37451 | 48591 42804 38818 | 35460 33272 +# #arcs in den.fst 249524 252690 255242 251118 238678| 342831|618289 515353 428241 | 299068 267092 +# LM perplexity 8.78 8.07 7.76 7.39 7.37 | 6.34 | 5.75 6.04 6.27 | 6.07 6.35 +# # phone-lm states 2644 2864 3092 4321 6438 | 7437 | 10000 7000 5000 | 8437 7437 +# # phone-lm arcs 44581 50007 54167 68044 73839 | 118699|192690 146938 110505 | 100969 88520 + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2a # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-states=8000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2b.sh b/egs/swbd/s5c/local/chain/run_tdnn_2b.sh new file mode 100755 index 00000000000..0515f73b434 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2b.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +# _2b is as _y but --frames-overlap-per-eg 75 (was 30 before). This is not very +# efficient in terms of disk space but I want to see the effect on results. + +# In terms of the objf, the training is a lot better, -0.0879->-0.0779, and validation is +# slightly better: -0.126 -> -0.123. +# But the WERs are 0.3 worse across the board: on train_dev, with tg 18.04->18.15, with fg +# 16.57->16.83; on all of eval2000, with tg 13.2->13.7, and with fg 11.7->12.0. +# I'm a little at a loss how to interpret these. +# Note: I decode an earlier iter (300) but the results were not much better: final->300, +# 13.7->13.7 on all of eval2000 with tg, and 18.15->18.10 on all of train_dev with tg. + +# _y is as _s but trying --apply-deriv-weights false. (note: in the +# interim, the script was changed so the train and valid probs have --pdf-boundary-penalty 0 +# and are no longer comparable with the ones in _s. +# +# Compared to s, the results are improved: on train_dev, 18.45->18.04 with tg +# and 16.96->16.57 with fg; on all of eval2000, 20.1->19.8 with tg and 18.0 to +# 17.9 with fg. +# +# +# I recomputed the train and valid probs using the .486 model and no --pdf-boundary-penalty option, to +# be able to compre with the _s ones. In _s the (train,valid) probs at iter 485 were (-0.0691, -0.0997), +# in _y the (train,valid) probs at iter 486 were (-0.0655,-0.0998). So better on train, essentially +# the same on valid. 
It makes sense it would be better on train, since its overtraining is more +# closely aligned with the distribution of training segments on which we compute the objf-- also because +# we've simply trained more, i.e. equivalent to slightly more epochs. + + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2b # Note: _sp will get added to this if $speed_perturb == true. 
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --pdf-boundary-penalty 0.0 \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 75" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + iter=300 + steps/nnet3/decode.sh --iter $iter --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff}_it$iter || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_it$iter || exit 1; + fi + ) & + done +fi + +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2c.sh b/egs/swbd/s5c/local/chain/run_tdnn_2c.sh new file mode 100755 index 00000000000..ffd2044c272 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2c.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# As expected the results are consistent with randomness: 2a->2c, on all of eval2000, +# before rescoring 19.8->19.8 and after rescoring 17.7->17.8; on train_dev, +# before rescoring 18.20->18.12, and after rescoring 16.59->16.73. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. 
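+
+# Illustration of the edge treatment just described (a sketch only, based on the wording
+# above; the real per-frame weights are dumped with the egs by the egs-preparation code):
+# for a 150-frame eg, weight 0 on the outermost 10 frames of each side, then a linear ramp
+# up to 1.0 over the next 10 frames, e.g.:
+#
+#   awk 'BEGIN { n=150; zero=10; ramp=10;
+#     for (t=0; t<n; t++) {
+#       d = (t < n-1-t) ? t : n-1-t;                  # distance from the nearer edge of the eg
+#       if (d < zero)           w = 0.0;              # zero derivative weight at the edges
+#       else if (d < zero+ramp) w = (d-zero+1)/ramp;  # ramp up to 1.0 over 10 frames
+#       else                    w = 1.0;
+#       printf("%d %.2f\n", t, w); } }'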
+ +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-states=8000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2d.sh b/egs/swbd/s5c/local/chain/run_tdnn_2d.sh new file mode 100755 index 00000000000..c93121499cd --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2d.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
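+
+# For reference, the transition-scale / self-loop-scale change described in the _2c note above
+# amounts to the following difference in the graph-building call (both forms appear verbatim in
+# the scripts in this directory; the form used has to match how the scales were applied in
+# training):
+#
+#   pre-2c:      utils/mkgraph.sh --transition-scale 0.0 --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+#   2c onwards:  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg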
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2d # Note: _sp will get added to this if $speed_perturb == true. 
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2e.sh b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh new file mode 100755 index 00000000000..a8552244ed2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# _2e is as _2b, but --frames-overlap-per-eg 0 (also compare with _y, which has +# an overlap of 30; _2b has 75). BUT we also made a code change as in 2a->2c, where we use +# transition-scale and self-loop-scale of 1, so we are making the same change in +# 2b->2e; it requires a script change too, to match. 
we'll have to correct the +# results for this. (note: this won't matter as the results did not change) +# +# Comparing results: +# expt: _2b _y _2e _s +# --frames-overlap-per-eg 75 30 0 30 +# --apply-deriv-weights f f f t +# all of eval2000 (tg) 20.1 19.8 19.7 20.1 +# all of eval2000 (fg) 18.0 17.9 17.8 18.0 +# train_dev (tg) 18.15 18.04 17.85 18.45 +# train_dev (fg) 16.83 16.57 16.52 16.96 +# ... on all of these tests, results are consistently better towards smaller +# --frames-overlap-per-eg. and apply-deriv-weights=f seems better. +# + + +# _2b is as _y but --frames-overlap-per-eg 75 (was 30 before). This is not very +# efficient in terms of disk space but I want to see the effect on results. + +# In terms of the objf, the training is a lot better, -0.0879->-0.0779, and validation is +# slightly better: -0.126 -> -0.123. +# But the WERs are 0.3 worse across the board: on train_dev, with tg 18.04->18.15, with fg +# 16.57->16.83; on all of eval2000, with tg 13.2->13.7, and with fg 11.7->12.0. +# I'm a little at a loss how to interpret these. +# Note: I decode an earlier iter (300) but the results were not much better: final->300, +# 13.7->13.7 on all of eval2000 with tg, and 18.15->18.10 on all of train_dev with tg. + +# _y is as _s but trying --apply-deriv-weights false. (note: in the +# interim, the script was changed so the train and valid probs have --pdf-boundary-penalty 0 +# and are no longer comparable with the ones in _s. +# +# Compared to s, the results are improved: on train_dev, 18.45->18.04 with tg +# and 16.96->16.57 with fg; on all of eval2000, 20.1->19.8 with tg and 18.0 to +# 17.9 with fg. +# +# +# I recomputed the train and valid probs using the .486 model and no --pdf-boundary-penalty option, to +# be able to compre with the _s ones. In _s the (train,valid) probs at iter 485 were (-0.0691, -0.0997), +# in _y the (train,valid) probs at iter 486 were (-0.0655,-0.0998). So better on train, essentially +# the same on valid. It makes sense it would be better on train, since its overtraining is more +# closely aligned with the distribution of training segments on which we compute the objf-- also because +# we've simply trained more, i.e. equivalent to slightly more epochs. + + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). 
+# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2e # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --pdf-boundary-penalty 0.0 \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +exit 0; + + +# BROKEN results where I had overlap of 75, so it was mostly just a repetition of _2b, except with +# that 2a->2c change. + +b01:s5c: for l in y 2b 2e; do grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_tg/score*/*ys | utils/best_wer.sh ; done +%WER 13.2 | 1831 21395 | 88.4 8.0 3.6 1.6 13.2 50.6 | exp/chain/tdnn_y_sp/decode_eval2000_sw1_tg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 13.7 | 1831 21395 | 88.1 8.2 3.7 1.8 13.7 51.0 | exp/chain/tdnn_2b_sp/decode_eval2000_sw1_tg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 13.4 | 1831 21395 | 88.4 8.2 3.4 1.8 13.4 50.8 | exp/chain/tdnn_2e_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +b01:s5c: for l in y 2b 2e; do grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | utils/best_wer.sh ; done +On iteration 368, learning rate is 0.00304840891076219. 
+Training neural net (pass 368) +%WER 11.7 | 1831 21395 | 89.7 7.0 3.2 1.4 11.7 47.8 | exp/chain/tdnn_y_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.0 | 1831 21395 | 89.5 7.1 3.4 1.5 12.0 49.4 | exp/chain/tdnn_2b_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.1 | 1831 21395 | 89.4 7.5 3.1 1.5 12.1 48.4 | exp/chain/tdnn_2e_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +b01:s5c: +b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh ; done +%WER 18.04 [ 8877 / 49204, 1125 ins, 2296 del, 5456 sub ] exp/chain/tdnn_y_sp/decode_train_dev_sw1_tg/wer_12_0.0 +%WER 18.15 [ 8930 / 49204, 1121 ins, 2244 del, 5565 sub ] exp/chain/tdnn_2b_sp/decode_train_dev_sw1_tg/wer_12_0.0 +%WER 18.24 [ 8975 / 49204, 1242 ins, 2064 del, 5669 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_tg/wer_11_0.0 +b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh ; done +%WER 16.57 [ 8155 / 49204, 1144 ins, 1988 del, 5023 sub ] exp/chain/tdnn_y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +%WER 16.83 [ 8282 / 49204, 1106 ins, 2115 del, 5061 sub ] exp/chain/tdnn_2b_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +%WER 16.79 [ 8260 / 49204, 1090 ins, 2138 del, 5032 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2f.sh b/egs/swbd/s5c/local/chain/run_tdnn_2f.sh new file mode 100755 index 00000000000..86c23acc90c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2f.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +# _2f is as _2d but following a code change, and with different LM options: +# --ngram-order=5 --num-lm-states=10000 +# Now the extra questions are not needed. +# see table in run_tdnn_2a.sh for results + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2f # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--ngram-order=5 --num-lm-states=10000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2g.sh b/egs/swbd/s5c/local/chain/run_tdnn_2g.sh new file mode 100755 index 00000000000..db2f7a00410 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2g.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# _2g is as _2f but reducing the --num-lm-states from 10k to 7k +# see table in run_tdnn_2a.sh for results. + +# _2f is as _2d but following a code change, and with different LM options: +# --ngram-order=5 --num-lm-states=10000 +# Now the extra questions are not needed. +# LM perplexity changes from 6.34 to 5.75. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
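+
+# A quick way to reproduce the den.fst / phone-LM size figures quoted in the table in
+# run_tdnn_2a.sh for a finished run (a sketch: it assumes the denominator FST and the phone
+# LM end up in the experiment directory as den.fst and phone_lm.fst, which should be checked
+# against what this version of the training script actually writes):
+#
+#   fstinfo exp/chain/tdnn_2g_sp/den.fst      | grep -E '# of (states|arcs)'
+#   fstinfo exp/chain/tdnn_2g_sp/phone_lm.fst | grep -E '# of (states|arcs)'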
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2g # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--ngram-order=5 --num-lm-states=7000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2h.sh b/egs/swbd/s5c/local/chain/run_tdnn_2h.sh new file mode 100755 index 00000000000..9d5bfdd1207 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2h.sh @@ -0,0 +1,241 @@ +#!/bin/bash + +# _2h is as _2g but --ngram-order=4, and --num-lm-states=5k. +# see table in run_tdnn_2a.sh for results. + +# _2g is as _2f but reducing the --num-lm-states from 10k to 7k. + +# _2f is as _2d but following a code change, and with different LM options: +# --ngram-order=5 --num-lm-states=10000 +# Now the extra questions are not needed. +# LM perplexity changes from 6.34 to 5.75. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2h # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
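+  # (Background note, added; our understanding of the 'chain' setup rather than
+  #  part of the original script: the --lm-opts string passed to train_tdnn.sh
+  #  below controls the phone-level n-gram LM from which the denominator graph
+  #  is built. The header comments above track its order and pruning
+  #  (--ngram-order, --num-lm-states) and the resulting LM perplexity across
+  #  the 2f/2g/2h variants.)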
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--ngram-order=4 --num-lm-states=5000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh new file mode 100755 index 00000000000..eaa5a77949f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# see table in run_tdnn_2a.sh for results + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. 
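+# (Added illustration of the stochasticity check mentioned above; the graph
+#  path is hypothetical -- any graph directory built by the mkgraph stage would
+#  do:
+#    fstisstochastic exp/chain/tdnn_2i_sp/graph_sw1_tg/HCLG.fst
+#  which prints the min/max deviation from stochasticity, something that is
+#  only meaningful once transition-scale and self-loop-scale are 1.0.)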
+ +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. 
+ +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2i # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2j.sh b/egs/swbd/s5c/local/chain/run_tdnn_2j.sh new file mode 100755 index 00000000000..70ba86a3fd0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2j.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +# _2j is as _2i but with --num-extra-lm-states=1000, not 2000. 
+# see table in run_tdnn_2a.sh for results + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. 
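+# (Illustrative sketch, added to make the _p note above concrete: with the
+#  egs-edge weighting described there, each edge of an example gets 10 frames
+#  of zero derivative weight followed by a ramp up to 1.0 over the next 10
+#  frames; a linear ramp is assumed here. The commented loop below just prints
+#  such a weight schedule and is not part of the recipe.)
+#   for t in $(seq 0 24); do
+#     if   [ $t -lt 10 ]; then w=0.0
+#     elif [ $t -lt 20 ]; then w=0.$((t - 10))
+#     else w=1.0; fi
+#     echo "frame $t deriv-weight $w"
+#   done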
+ +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2j # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
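+  # (Added note: --pdf-boundary-penalty 0.0, used in the command below,
+  #  disables the boundary penalty; per the comments in the later 2k and 2n
+  #  scripts this behaviour became the default and the option was then removed
+  #  from the script.)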
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-lm-states=1000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2k.sh b/egs/swbd/s5c/local/chain/run_tdnn_2k.sh new file mode 100755 index 00000000000..fb1f59d3c5a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2k.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. +# +# This is helpful more often than not (but it doesn't seem to make as much +# of a difference as it did before). +# 2i 2k +# train_dev,tg 17.44 17.08 +# train_dev,fg 16.09 15.79 +# eval2000,tg 19.2 19.3 +# eval2000,fg 17.3 17.3 + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. 
+ +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2k # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # note, I removed the --pdf-boundary 0.0 option after taking it out of the script + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
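+  # (Background note, added; standard for chain systems rather than specific to
+  #  this patch: the graph is built with --self-loop-scale 1.0 and decoding is
+  #  done with --acwt 1.0; the --post-decode-acwt 10.0 option in the decode
+  #  stage below only rescales the lattice scores into the usual range expected
+  #  by the scoring scripts.)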
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2l.sh b/egs/swbd/s5c/local/chain/run_tdnn_2l.sh new file mode 100755 index 00000000000..56365029f3c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2l.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# _2l is as _2k, but using 100 frames per eg instead of 150. +# Previously we had found 150 better than 75, but this may have changed as we +# are no longer treating the edges in the same way (e.g. we now use +# --pdf-boundary-penalty=0.0). So re-tuning. + +# This is: [better by 0.1, better by 0.1, the same, worse by 0.1]. So +# I guess it's either not sensitive to this, or the optimal value lies +# somewhere in between. I'm leaving it at 150 in the scripts for +# now, but if we have memory problems in the future, we can reduce to 100. +# +# 2k 2l +# --frames-per-eg 150 100 +# train_dev,tg 17.08 16.99 +# train_dev,fg 15.79 15.67 +# eval2000,tg 19.3 19.3 +# eval2000,fg 17.3 17.4 + + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2l # Note: _sp will get added to this if $speed_perturb == true. 
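+# (Sketch of how the _sp suffix is typically applied; that part of the script
+#  is not shown in this patch, so the exact lines below are assumptions. The
+#  names $suffix and $train_set are consistent with their later use in the
+#  train_tdnn.sh call.)
+#   suffix=
+#   if [ "$speed_perturb" == "true" ]; then suffix=_sp; fi
+#   dir=${dir}${suffix}
+#   train_set=train_nodup${suffix}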
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=100 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2m.sh b/egs/swbd/s5c/local/chain/run_tdnn_2m.sh new file mode 100755 index 00000000000..93ba4ac82b3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2m.sh @@ -0,0 +1,260 @@ +#!/bin/bash + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# So it's [0.1 worse, 0.1 worse, 0.6 better, 0.3 better]: better on average. +# Which kind of makes sense (we expected that the previous limitation on how the +# tree was built would not be helpful). 
+ +# 2k 2m +# --leftmost-questions-truncate 30 -1 +# train_dev,tg 17.08 17.22 +# train_dev,fg 15.79 15.87 +# eval2000,tg 19.3 18.7 +# eval2000,fg 17.3 17.0 +# in tree-building, +# like-impr 4.9099 5.33844 +# Den-fst num-states 35460 299068 +# Den-fst num-arcs 47036 331403 + + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). 
+# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2m # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
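+  # (Added note, our reading of the _2k comment above: with
+  #  --apply-deriv-weights false the per-frame derivative weights dumped with
+  #  the egs are ignored, so the edge overlap is also set to 0 via
+  #  --egs-opts "--frames-overlap-per-eg 0" in the command below.)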
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2n.sh b/egs/swbd/s5c/local/chain/run_tdnn_2n.sh new file mode 100755 index 00000000000..c90c5f0a41f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2n.sh @@ -0,0 +1,301 @@ +#!/bin/bash + +# _2n is as _2m, but using the combine-data script to ensure that we don't have +# very short segments (this can cause an excessive amount of either missing or +# overlapped data in the egs). + +# (m->n) This doesn't seem to make a consistent difference, but maybe a little worse. +# Note, the tree-split improvement was more in 2n. I suspect this it's because we +# did the alignments after the 'max1' thing, and the fMLLR was somehow more +# utterance-specific. + +# WER on 2m 2n +# train_dev,tg 17.22 17.11 0.1 better +# train_dev,fg 15.87 15.75 0.1 better +# eval2000,tg 18.7 19.2 0.5 worse +# eval2000,fg 17.0 17.2 0.2 worse +# +# tree-split impr 5.34 5.78 +# train-prob,final -0.080 -0.090 +# valid-prob,final -0.116 -0.1006 # note, the 2n valid prob is not correct, because +# # the combine_data.sh script doesn't preserve utt2uniq info. + +# (note: I removed the --pdf-boundary-penalty 0.0 option from the script as it's +# now the default, and no longer supported.) + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. 
By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. 
+ +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=9 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2n # Note: _sp will get added to this if $speed_perturb == true. + + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=true +min_segment_length=8 # min length in seconds, for combining data. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
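+  # (Added explanation, to the best of our understanding: on the CLSP grid,
+  #  utils/create_split_dir.pl above creates real directories on the listed
+  #  /export/b0{5,6,7,8} filesystems and makes $dir/egs/storage point at them,
+  #  so the large egs archives are spread over several disks; elsewhere the
+  #  egs are simply written under $dir/egs.)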
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/chain/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/$train_set_hires $treedir exp/tri4_lats_${train_set} $dir || exit 1; +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 17 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2o.sh b/egs/swbd/s5c/local/chain/run_tdnn_2o.sh new file mode 100755 index 00000000000..5a8166acbf7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2o.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# Correction: after rerunning, it actually seems a little worse. +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# WER on 2m 2o 2o[rerun after delete] +# train_dev,tg 17.22 17.24 17.19 +# train_dev,fg 15.87 15.93 15.89 +# eval2000,tg 18.7 18.7 19.3 +# eval2000,fg 17.0 16.9 17.4 + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. 
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+ +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2o # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2p.sh b/egs/swbd/s5c/local/chain/run_tdnn_2p.sh new file mode 100755 index 00000000000..3ff85ad5562 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2p.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# _2p is as _2m, but 6500 instead of 9000 as the target for num-leaves. + +# consistently slightly worse. + +# WER on 2m 2p +# train_dev,tg 17.22 17.42 0.2 worse +# train_dev,fg 15.87 16.07 0.2 worse +# eval2000,tg 18.7 19.0 0.3 worse +# eval2000,fg 17.0 17.1 0.1 worse +# +# oddly, the final train and valid probs were better. +# final-train -0.0803 -0.0791 +# final-valid -0.0116 -0.0115 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. +# Caution: I accidentally overwrote its treedir with the '2o' experiment, so I +# moved it to '2o'. But the 2m experiment was done by then. + +# So it's [0.1 worse, 0.1 worse, 0.6 better, 0.3 better]: better on average. +# Which kind of makes sense +# +# 2k 2m +# --leftmost-questions-truncate 30 -1 +# train_dev,tg 17.08 17.22 +# train_dev,fg 15.79 15.87 +# eval2000,tg 19.3 18.7 +# eval2000,fg 17.3 17.0 +# in tree-building, +# like-impr 4.9099 5.33844 +# Den-fst num-states 35460 299068 +# Den-fst num-arcs 47036 331403 + + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". 
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+ +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2p # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6500 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2q.sh b/egs/swbd/s5c/local/chain/run_tdnn_2q.sh new file mode 100755 index 00000000000..2c7669cdbc4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2q.sh @@ -0,0 +1,268 @@ +#!/bin/bash + +# _2q is as _2o but changing from 9000 -> 6000 states as the target. +# (like 2p, where it wasn't helpful, but doing this experiment for the topology with fewer state). 
+ +# it's consistently a little worse. +# WER on 2o 2q +# train_dev,tg 17.24 17.43 0.2% worse +# train_dev,fg 15.93 16.07 0.2% worse +# eval2000,tg 18.7 19.0 0.3% worse +# eval2000,fg 16.9 17.1 0.2% worse +# train-prob -0.08352 -0.08441 +# valid-prob -0.1218 -0.01221 + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! + +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. 
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2q # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set data/lang_chain_o $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2r.sh b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh new file mode 100755 index 00000000000..d17ebdf9be7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh @@ -0,0 +1,304 @@ +#!/bin/bash + +# _2r is as _2q, but further changing the topology to have one rather than +# two pdf-ids per triphone. + +# it's consistently worse, and a fairly substantial difference. +# WER on 2q 2r +# train_dev,tg 17.43 17.82 0.4% worse +# train_dev,fg 16.07 16.64 0.6% worse +# eval2000,tg 19.0 19.8 0.8% worse +# eval2000,fg 17.1 18.0 0.9% worse +# train-prob -0.08441 -0.08318 +# valid-prob -0.01221 -0.1272 + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! + +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. 
+ +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2r # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. The "--pdf-class-list=0" option is + # needed, as in this type of topology we only have a single pdf-class, + # numbered zero. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cluster-phones-opts "--pdf-class-list=0" \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set data/lang_chain_2r $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + + +# Just a note: I saw some warnings like this in the logs: + +WARNING (nnet3-chain-normalize-egs:main():nnet3-chain-normalize-egs.cc:72) For example sp1.0-sw02859-B_050239-051084-0, FST was empty after composing with normalization FST. This should be extremely rare (a few per corpus, at most) + +#below is how I verified that they were caused by a benign cause.. it was that the lattice versus +#1-best alignment had different paths (and presumably the lattice didn't have the same path +#contained in the 1-best. +# +# after the first ow_S we have a silence in the 1-best: + +copy-int-vector 'ark:gunzip -c exp/chain/tri5r_tree_sp/ali.45.gz |' ark,t:- | grep sp1.0-sw02859-B_050239-051084 | ali-to-phones exp/chain/tri5r_tree_sp/final.mdl ark:- ark,t:- | utils/int2sym.pl -f 2- data/lang/phones.txt +copy-int-vector 'ark:gunzip -c exp/chain/tri5r_tree_sp/ali.45.gz |' ark,t:- +ali-to-phones exp/chain/tri5r_tree_sp/final.mdl ark:- ark,t:- +LOG (copy-int-vector:main():copy-int-vector.cc:83) Copied 5884 vectors of int32. +LOG (ali-to-phones:main():ali-to-phones.cc:134) Done 1 utterances. 
+sp1.0-sw02859-B_050239-051084 sil ow_S sil ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil + + +# but in the lattice (which seems to be linear at that point), after the first +# ow_S there is no silence: + +lattice-copy "ark:gunzip -c exp/tri4_lats_nodup_sp/lat.45.gz |" "scp,t,p:echo sp1.0-sw02859-B_050239-051084 -|" | lattice-best-path "scp:echo sp1.0-sw02859-B_050239-051084 -|" ark:/dev/null ark,t:- | ali-to-phones exp/tri4/final.mdl ark:- ark,t:- | utils/int2sym.pl -f 2- data/lang/phones.txt +lattice-copy 'ark:gunzip -c exp/tri4_lats_nodup_sp/lat.45.gz |' 'scp,t,p:echo sp1.0-sw02859-B_050239-051084 -|' +lattice-best-path 'scp:echo sp1.0-sw02859-B_050239-051084 -|' ark:/dev/null ark,t:- +ali-to-phones exp/tri4/final.mdl ark:- ark,t:- +LOG (lattice-best-path:main():lattice-best-path.cc:99) For utterance sp1.0-sw02859-B_050239-051084, best cost 53.7031 + 39521.9 = 39575.6 over 843 frames. +LOG (lattice-best-path:main():lattice-best-path.cc:124) Overall score per frame is 46.9461 = 0.0637047 [graph] + 46.8824 [acoustic] over 843 frames. +LOG (lattice-best-path:main():lattice-best-path.cc:128) Done 1 lattices, failed for 0 +LOG (ali-to-phones:main():ali-to-phones.cc:134) Done 1 utterances. +sp1.0-sw02859-B_050239-051084 sil ow_S ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2s.sh b/egs/swbd/s5c/local/chain/run_tdnn_2s.sh new file mode 100755 index 00000000000..6f7f9978ac6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2s.sh @@ -0,0 +1,260 @@ +#!/bin/bash + +# _2s is as _2o, but another topology, this time with 3 states and 3 pdf-ids +# worse :-( + +# WER on 2o 2s +# train_dev,tg 17.24 17.19 no diff +# train_dev,fg 15.93 15.97 no diff +# eval2000,tg 18.7 19.0 0.3 worse +# eval2000,fg 16.9 17.2 0.3 worse +# + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! + + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. 
So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+ +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2s # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2t.sh b/egs/swbd/s5c/local/chain/run_tdnn_2t.sh new file mode 100755 index 00000000000..53a343d9f80 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2t.sh @@ -0,0 +1,264 @@ +#!/bin/bash + +# _2t is as _2o and _2s, but another topology: with 3 pdf-ids like 2s, but +# differently arranged. +# see table below, it's worse. + +#[ _2s is as _2o, but another topology, this time with 3 states and 3 pdf-ids +# worse :-(] + +# WER on 2o 2s 2t +# train_dev,tg 17.24 17.19 17.44 +# train_dev,fg 15.93 15.97 +# eval2000,tg 18.7 19.0 19.4 +# eval2000,fg 16.9 17.2 +# + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! + + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2t # Note: _sp will get added to this if $speed_perturb == true. 
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2u.sh b/egs/swbd/s5c/local/chain/run_tdnn_2u.sh new file mode 100755 index 00000000000..c05fb697d6f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2u.sh @@ -0,0 +1,276 @@ +#!/bin/bash + +# _2u is as _2o, but using 'not-shared' in the roots files, to ensure that +# the initial and non-initial states will never be shared. I don't expect this +# to make any difference, as that question always gets asked, but it's a baseline for _2v. + + +# If anything, it's a little worse. 
+ +# WER on 2o 2u +# train_dev,tg 17.24 17.23 no diff +# train_dev,fg 15.93 15.98 no diff +# eval2000,tg 18.7 19.3 0.6% worse +# eval2000,fg 16.9 17.3 0.4% worse + + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2u # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + # use 'not-shared' roots so initial and non-initial pdf-ids cannot be the + # same. + awk '{$1 = "not-shared"; print;}' $lang/phones/roots.txt + awk '{$1 = "not-shared"; print;}' $lang/phones/roots.int +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2v.sh b/egs/swbd/s5c/local/chain/run_tdnn_2v.sh new file mode 100755 index 00000000000..3d279841190 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2v.sh @@ -0,0 +1,281 @@ +#!/bin/bash + +# _2v is as _2u, but adding the --scale-stats-for-pdf-classes="1=0.5" option to +# the tree building, to scale down the stats for the self-loop to have fewer pdf-ids +# assigned there and more to the initial state. + +# It's maybe a shade better than 2u, but certainly not better than 2o. I don't +# think I'll pursue this. Note: the code and the script option may not be +# checked in, and won't be checked in with this commit. + +# WER on 2o 2u 2v +# train_dev,tg 17.24 17.23 17.28 0.05% worse than 2u +# train_dev,fg 15.93 15.98 16.05 0.05% worse than 2u +# eval2000,tg 18.7 19.3 19.1 0.2% better than 2u +# eval2000,fg 16.9 17.3 17.1 0.2% better than 2u. + + +# _2u is as _2o, but using 'not-shared' in the roots files, to ensure that +# the initial and non-initial states will never be shared. I don't expect this +# to make any difference, as that question always gets asked, but it's a baseline for _2v. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. 
+ +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2v # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + # use 'not-shared' roots so initial and non-initial pdf-ids cannot be the + # same. + awk '{$1 = "not-shared"; print;}' $lang/phones/roots.txt + awk '{$1 = "not-shared"; print;}' $lang/phones/roots.int +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --scale-stats-for-pdf-classes "1=0.5" \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2w.sh b/egs/swbd/s5c/local/chain/run_tdnn_2w.sh new file mode 100755 index 00000000000..bcfc93aadb0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2w.sh @@ -0,0 +1,276 @@ +#!/bin/bash + +# _2w is as _2o, but setting the frame subsampling factor to 2 instead of 3. +# Going back to 100 frames per eg, which I previously found to be about the same in +# WER, because we were running out of memory [although this is before a code +# change to use reorder=false, which halved the num-states in the graph on this setup +# [~45k->22k], and reduced the num-transitions to a quarter [900k->225k]. + + +# a little surprisingly, it's worse, and clearly so. +# note, we can't really compare the objf values, as the chunk size is not the same. + +# WER on 2m 2o 2w +# train_dev,tg 17.22 17.24 17.62 +# train_dev,fg 15.87 15.93 16.49 +# eval2000,tg 18.7 18.7 19.4 +# eval2000,fg 17.0 16.9 17.8 + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. 
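For context on the two options just mentioned: with deriv weights applied and a
nonzero --frames-overlap-per-eg (the setup described in the _p note further down),
each end of an eg gets roughly 10 zero-weight frames followed by a roughly 10-frame
linear ramp up to 1.0, and the _s -> _2e change turns this off entirely. A rough
illustration of that ramp shape only, not code from any Kaldi script:

  # per-frame derivative weight near the start of an eg (shape as described above)
  for t in $(seq 0 29); do
    awk -v t=$t 'BEGIN {
      if (t < 10)      w = 0.0;               # frames fully inside the overlap: no derivative
      else if (t < 20) w = (t - 9) / 10.0;    # linear ramp up to 1.0
      else             w = 1.0;               # normal frames: full weight
      printf("frame %2d  weight %.1f\n", t, w);
    }'
  done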
+ +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. 
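Relating the _2w note at the top of this file to the egs configuration: the number of
supervised output frames per eg is roughly frames_per_eg divided by the frame
subsampling factor, so 100 input frames at a factor of 2 and 150 input frames at a
factor of 3 both supervise about 50 outputs per eg. A quick sanity check (the
relationship is assumed here for illustration only):

  frame_subsampling_factor=2
  frames_per_eg=100
  echo "output frames per eg: $((frames_per_eg / frame_subsampling_factor))"   # -> 50
  echo "output frames per eg at factor 3, 150 frames: $((150 / 3))"            # -> 50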
+ +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2w # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=100 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor $frame_subsampling_factor \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor $frame_subsampling_factor \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2x.sh b/egs/swbd/s5c/local/chain/run_tdnn_2x.sh new file mode 100755 index 00000000000..bff0983bd49 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2x.sh @@ -0,0 +1,282 @@ +#!/bin/bash + +# _2x is as _2w (which has frame subsampling factor of 2 not 3), but with more +# epochs (6 vs 4), as it looks like the 2w model hadn't completely trained. +# Re-using the egs. I added the results to the table below. The WER is +# even worse than 2x. + +# _2w is as _2o, but setting the frame subsampling factor to 2 instead of 3. +# Going back to 100 frames per eg, which I previously found to be about the same in +# WER, because we were running out of memory [although this is before a code +# change to use reorder=false, which halved the num-states in the graph on this setup +# [~45k->22k], and reduced the num-transitions to a quarter [900k->225k]. + + +# a little surprisingly, it's worse, and clearly so. +# note, we can't really compare the objf values, as the chunk size is not the same. + +# WER on 2m 2o 2w 2x +# train_dev,tg 17.22 17.24 17.62 17.79 +# train_dev,fg 15.87 15.93 16.49 16.57 +# eval2000,tg 18.7 18.7 19.4 19.6 +# eval2000,fg 17.0 16.9 17.8 18.0 + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. 
+ +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2x # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=6 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=100 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor $frame_subsampling_factor \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2w_sp/egs \ + --frame-subsampling-factor $frame_subsampling_factor \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2y.sh b/egs/swbd/s5c/local/chain/run_tdnn_2y.sh new file mode 100755 index 00000000000..6d61b7d860d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2y.sh @@ -0,0 +1,267 @@ +#!/bin/bash + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. 
The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. 
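A small back-of-the-envelope check on the _2y change described at the top of this
file: with 150-frame egs, raising --frames-per-iter from 800k to 1.2 million raises
the number of egs processed per iteration from roughly 5,300 to 8,000, which is what
reduces the relative cost of the per-iteration overhead such as model averaging.
Illustration only, assuming frames-per-iter is counted in the same frames as
frames-per-eg (the exact bookkeeping inside the training script may differ):

  frames_per_iter=1200000
  frames_per_eg=150
  echo "approx egs per iteration: $((frames_per_iter / frames_per_eg))"   # -> 8000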
+ +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2y # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3c.sh b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh new file mode 100755 index 00000000000..4f350891e8a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# _3c is as _2y, but using 'jesus' nonlinearity: the --jesus-dim 800 option, instead of +# --relu-dim 850. +# reusing the egs from 2y. +# caution: see config section, I changed some things while running. + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. 
+ +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. 
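On the max-param-change option referred to in the _h and _k notes: the idea is to
limit how large a parameter change any single minibatch can cause, roughly by scaling
down an update whose norm would exceed the limit. A conceptual sketch only, with a
made-up update norm; this is not how the training binaries are invoked:

  max_param_change=1.0
  delta_norm=2.5   # hypothetical norm of a proposed parameter update
  scale=$(awk -v n=$delta_norm -v m=$max_param_change \
    'BEGIN { s = (n > m) ? m / n : 1.0; printf "%.2f\n", s }')
  echo "update scaled by $scale"   # -> 0.40, so the applied change has norm 1.0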
+ +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +# max_param_change=1.0 +max_param_change=0.5 # Changed it to this value on iteration 74. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 # switched to 64 on iteration 7 after a failure. +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --jesus-dim 800 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3d.sh b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh new file mode 100755 index 00000000000..ca8080db080 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# (note: cannot be reproduced using current scripts). +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# Results are about the same as 2y, or maybe just a little worse. 
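The splice-indexes string quoted above gives one set of frame offsets per TDNN layer;
the total acoustic context of the network is roughly the sum of the most negative
offsets plus the sum of the most positive offsets across layers. A small helper to
compute that for any such string (illustrative only, not part of the recipe):

  splice_indexes="-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
  echo "$splice_indexes" | awk '{
    left = 0; right = 0;
    for (i = 1; i <= NF; i++) {            # one field per layer, e.g. "-6,-3,0,3"
      n = split($i, a, ",");
      lo = a[1] + 0; hi = lo;
      for (j = 2; j <= n; j++) { v = a[j] + 0; if (v < lo) lo = v; if (v > hi) hi = v; }
      left += lo; right += hi;
    }
    printf("total left context: %d frames, right context: %d frames\n", -left, right);
  }'   # -> 18 left, 13 right for the string above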
+ +# a03:s5c: ./show_wer.sh 3d +# %WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. 
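+
+# (An illustration of the state-pruning idea behind --num-extra-states /
+# --num-extra-lm-states described above, with made-up state names and log-like gains:
+# the un-pruned lower-order n-grams are always kept, and only the N higher-order states
+# with the largest estimated gain are admitted on top, i.e. a greedy top-N selection:)
+# printf "state_a 120.5\nstate_b 3.2\nstate_c 45.0\nstate_d 0.7\nstate_e 88.1\n" \
+#   | sort -k2,2gr | head -n 3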
+ +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
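+
+# (A numeric sketch of the egs edge handling described in the _p note above; note that
+# this particular run sets --apply-deriv-weights false, so these weights are not actually
+# applied here.  Assuming a 150-frame chunk, the per-frame derivative weights would be
+# zero for the outer 10 frames on each side and then ramp linearly up to 1.0 over the
+# next 10 frames:)
+# awk 'BEGIN { n=150;
+#   for (t=0; t<n; t++) {
+#     d = (t < n-1-t) ? t : n-1-t;              # distance from the nearer chunk edge
+#     w = (d < 10) ? 0.0 : (d < 20 ? (d-9)/10.0 : 1.0);
+#     printf("%d %.1f\n", t, w) } }'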
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3e.sh b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh new file mode 100755 index 00000000000..af5661b8c85 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh @@ -0,0 +1,275 @@ +#!/bin/bash + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. +# (note: cannot be reproduced using current scripts). + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
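+
+# (Rough arithmetic for the --num-jesus-blocks change described at the top of this file,
+# on the assumption that the blocks split the large Jesus-layer matrices into equal
+# block-diagonal pieces: an Nin x Nout matrix costs Nin*Nout multiplies per frame, but
+# with B blocks only Nin*Nout/B, so 200 blocks is roughly half the work of 100:)
+# awk 'BEGIN { nin=15000; nout=1800;
+#   for (b=100; b<=200; b*=2) printf("blocks=%d  multiplies/frame=%d\n", b, nin*nout/b) }'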
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3f.sh b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh new file mode 100755 index 00000000000..f33459f5f08 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh @@ -0,0 +1,283 @@ +#!/bin/bash + + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# (note: cannot be reproduced using current scripts). +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. 
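+
+# (Back-of-the-envelope for the _2y frames-per-iter change above: the total number of
+# frames processed is fixed by the number of epochs, so the iteration count scales as
+# 1/frames-per-iter, and the fixed per-iteration overhead such as model averaging shrinks
+# by the same 1.5x factor:)
+# awk 'BEGIN { for (fpi=800000; fpi<=1200000; fpi+=400000)
+#   printf("frames_per_iter=%d  relative iteration count %.2f\n", fpi, 800000/fpi) }'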
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3g.sh b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh new file mode 100755 index 00000000000..ff1e539306f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh @@ -0,0 +1,303 @@ +#!/bin/bash + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# (note: cannot be reproduced using current scripts). +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. 
+# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3h.sh b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh new file mode 100755 index 00000000000..f0e9efc2ac4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. 
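+
+# (A small parse of that splice string, on the assumed reading that offsets after a ':'
+# are taps on the layer's own earlier output, i.e. left recurrence at -3 frames, while
+# the offsets before it are the usual feed-forward splicing; the recurrence is why the
+# decode stage below passes --extra-left-context:)
+# for layer in "-2,-1,0,1,2" "-1,2" "-3,0,3:-3" "-6,-3,0,3:-3" "-6,-3,0,3:-3"; do
+#   feed=${layer%%:*}; rec=${layer#*:}; [ "$rec" = "$layer" ] && rec="none"
+#   echo "feed-forward: $feed   recurrent: $rec"
+# done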
+ +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
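+
+# (Related to the _2c note above about checking that graphs are stochastic: once stage 13
+# below has built the graph, a check along these lines reports how far HCLG is from
+# stochastic; an optional sanity check, not something this recipe runs:)
+# fstisstochastic $dir/graph_sw1_tg/HCLG.fst || true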
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. 
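+
+# (A sketch of how the schedule options above are typically consumed, on the assumption
+# that the effective learning rate decays geometrically from the initial to the final
+# value over training while the number of parallel jobs ramps from num_jobs_initial to
+# num_jobs_final; the 100-iteration count is made up just to show the shape:)
+# awk -v i=0.001 -v f=0.0001 -v j0=3 -v j1=16 -v iters=100 'BEGIN {
+#   for (t=0; t<=iters; t+=25) {
+#     lr = i * exp((t/iters) * log(f/i));
+#     nj = j0 + (j1-j0)*t/iters;
+#     printf("iter %3d  effective-lrate %.6f  num-jobs ~%d\n", t, lr, nj) } }'
+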
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3i.sh b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh new file mode 100755 index 00000000000..876048b5852 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. 
+# also a code fix (the recurrent connections weren't being used; bug in OptionalDescriptor) + +# Here is the original decoding, with frame-per-chunk=50 +#./show_wer.sh 3i +#%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# and a newer decoding with frames-per-chunk=100. +# ./show_wer.sh 3i +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# after initial decoding wasn't great, trying increasing frames-per-chunk from +# 50 to 100. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
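  # Editor's note (illustrative, not part of the original patch): the
  # create_split_dir.pl call above relies on ordinary bash brace expansion to
  # name one target directory per file system, which the script then uses to
  # spread the dumped egs over several disks (via links under $dir/egs/storage).
  # The expansion itself is easy to check:
  #   echo /export/b0{5,6,7,8}/$USER/kaldi-data
  #   # -> /export/b05/<user>/kaldi-data ... /export/b08/<user>/kaldi-data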
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 100 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3j.sh b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh new file mode 100755 index 00000000000..faef84e8879 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh @@ -0,0 +1,296 @@ +#!/bin/bash + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3j # Note: _sp will get added to this if $speed_perturb == true. 
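# Editor's note (illustrative sketch, assuming the usual structure of these
# recipes; the lines that actually do this are not visible in this hunk): the
# "_sp" suffix mentioned in the comment above is normally appended along these
# lines, with the speed-perturbed training set picked up at the same time:
#   suffix=
#   if [ "$speed_perturb" == "true" ]; then suffix=_sp; fi
#   dir=${dir}$suffix
#   train_set=train_nodup$suffix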
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh new file mode 100755 index 00000000000..b869c7b2553 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. + +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3k # Note: _sp will get added to this if $speed_perturb == true. 
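# Editor's note (illustrative, not part of the original patch): in the
# --splice-indexes string passed below, a trailing ":-3" on a layer marks the
# (left) recurrence described for _3f above, i.e. a connection back to that
# layer's output from 3 frames earlier, on top of the usual feed-forward
# offsets.  A small loop makes the per-layer setup explicit:
#   splice_indexes="-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
#   layer=0
#   for spec in $splice_indexes; do
#     layer=$((layer+1))
#     case $spec in
#       *:*) echo "layer $layer: splices ${spec%%:*}, recurrence at offset ${spec#*:}";;
#       *)   echo "layer $layer: splices $spec, no recurrence";;
#     esac
#   done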
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh new file mode 100755 index 00000000000..7a016ed2197 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# 3k2 is as 3k, but dumping the egs with --extra-left-context 20. 
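# Editor's note (illustrative, not part of the original patch): the recurrent
# connections would in principle like to see history beyond the fixed context
# that the feed-forward splices already require, which is what
# --extra-left-context supplies when the egs are dumped.  For reference, the
# feed-forward part of the splice setup used below works out to 18 frames of
# left context and 13 of right:
#   splice_indexes="-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
#   left=0; right=0
#   for spec in $splice_indexes; do
#     ff=${spec%%:*}   # ignore the recurrent part after ':'
#     min=$(echo "$ff" | tr ',' '\n' | sort -n | head -n 1)
#     max=$(echo "$ff" | tr ',' '\n' | sort -n | tail -n 1)
#     [ $min -lt 0 ] && left=$((left - min))
#     [ $max -gt 0 ] && right=$((right + max))
#   done
#   echo "feed-forward left=$left right=$right"   # prints: feed-forward left=18 right=13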
+# Also there will have been some script changes in the meantime, +# e.g. possibly nonzero bias-mean; and reduced max-change on mix-up +# iters. + +# log-probs are better than 3k and in fact better than any experiment so far: +# valid -0.115->-0.107, and train -0.077 to -0.074. + +# Here is the WER using the default --frames-per-chunk of 50, and --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 20.45 [ 10060 / 49204, 988 ins, 3050 del, 6022 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 19.02 [ 9359 / 49204, 977 ins, 2877 del, 5505 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 22.3 | 4459 42989 | 79.9 12.8 7.3 2.3 22.3 60.2 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 20.4 | 4459 42989 | 81.5 11.1 7.4 1.9 20.4 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.filt.sys + +#... and here is the WER after changing it to 150, still with --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 18.91 [ 9306 / 49204, 1076 ins, 2517 del, 5713 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 17.43 [ 8574 / 49204, 958 ins, 2607 del, 5009 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.0 2.4 20.6 58.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 18.8 | 4459 42989 | 83.4 10.9 5.6 2.3 18.8 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is --frames-per-chunk 150, --extra-left-context 50 (changing the extra-left-context from 20 to 50 makes it worse): +#./show_wer.sh 3k2 +#%WER 19.46 [ 9574 / 49204, 1134 ins, 2635 del, 5805 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.87 [ 8792 / 49204, 880 ins, 3011 del, 4901 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 21.0 | 4459 42989 | 81.2 12.4 6.3 2.2 21.0 58.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.2 | 4459 42989 | 82.7 10.8 6.5 1.9 19.2 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --frames-per-chunk 150, --extra-left-context 50, --extra-left-context-initial 20. +#./show_wer.sh 3k2 +#%WER 19.10 [ 9400 / 49204, 1116 ins, 2498 del, 5786 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.54 [ 8628 / 49204, 884 ins, 2890 del, 4854 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.1 2.3 20.6 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 18.7 | 4459 42989 | 83.4 10.8 5.8 2.1 18.7 55.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --extra-left-context-initial 20 --extra-left-context 50 --frames-per-chunk 100. +# I think what's happening is that it's figuring out when it's near the end of the chunk, and encouraging +# deletions at that point, for reasons that relate to edge effects in the objective function. 
+#./show_wer.sh 3k2 +#%WER 17.87 [ 8793 / 49204, 1061 ins, 2277 del, 5455 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.36 [ 8049 / 49204, 1033 ins, 2148 del, 4868 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.8 11.8 5.5 2.5 19.7 57.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.4 10.3 5.2 2.2 17.8 54.7 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. + +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. 
+# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3k2 # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
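  # Editor's note (illustrative, not part of the original patch): with
  # --frame-subsampling-factor 3, the chain output (and hence the tree and
  # alignments built here) is evaluated only once every 3 input frames, i.e.
  # every 30 ms at the usual 10 ms frame shift:
  #   awk 'BEGIN{printf "outputs/sec at 100 input frames/sec: %.1f\n", 100/3}'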
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --extra-left-context 20 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial 20 \ + --extra-left-context 50 \ + --frames-per-chunk 100 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3l.sh b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh new file mode 100755 index 00000000000..608e437659e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh @@ -0,0 +1,306 @@ +#!/bin/bash + +# [abandoned, not working well.] +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. 
I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. 
So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+ +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3m.sh b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh new file mode 100755 index 00000000000..b25f9f15130 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# [note: this uses BlockAffineComponent not RepeatedAffineComponent] +# _3m is as _3l, but changing --jesus-stddev-scale from 0.2 to 0.1, as the Jesus layers +# were learning too slowly in 3l (this will make them learn approximately 4x faster). +# [terminated, likelihoods were not promising]. + +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. 
(see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
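For readers who do not know the two component types compared in the _3j/_3l/_3m notes at the top of this file: as far as I understand nnet3, RepeatedAffineComponent repeats one shared affine block across all of the Jesus-layer blocks, while BlockAffineComponent gives every block its own parameters, so it has roughly num-blocks times as many weights for the same layout. The numpy sketch below (with made-up block sizes) only illustrates that structural difference; it is not how nnet3 actually stores these components:

import numpy as np

# Illustrative parameter layouts for a block-structured affine layer with
# num_blocks blocks, each mapping in_dim -> out_dim.  Block sizes are made up.
def repeated_affine_params(num_blocks, out_dim, in_dim, rng):
    shared = rng.standard_normal((out_dim, in_dim))   # one block, reused everywhere
    return [shared] * num_blocks

def block_affine_params(num_blocks, out_dim, in_dim, rng):
    return [rng.standard_normal((out_dim, in_dim)) for _ in range(num_blocks)]

rng = np.random.default_rng(0)
rep = repeated_affine_params(100, 18, 6, rng)
blk = block_affine_params(100, 18, 6, rng)
print("distinct parameters, repeated-affine:", rep[0].size)              # 108
print("distinct parameters, block-affine:   ", sum(b.size for b in blk)) # 10800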
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.1 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3n.sh b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh new file mode 100755 index 00000000000..dedbd84be75 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. 
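To spell out the notation used in the recurrent splice-indexes strings discussed above: each space-separated field corresponds to one layer, the comma-separated integers before the ':' are the spliced frame offsets, and in the recurrent setups the part after the ':' gives the offset(s) of the recurrent connection, e.g. "-3,0,3:-3" splices offsets -3,0,3 and adds a recurrent input from 3 frames back. This is just my reading of the comments; the sketch below parses the string under that interpretation and is not taken from make_jesus_configs_recurrent.py:

# Parse a --splice-indexes string such as
#   "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
# into per-layer (splice_offsets, recurrence_offsets) pairs.
def parse_splice_indexes(spec):
    layers = []
    for field in spec.split():
        if ":" in field:
            splice_part, recur_part = field.split(":", 1)
            recurrence = [int(x) for x in recur_part.split(",")]
        else:
            splice_part, recurrence = field, []
        splice = [int(x) for x in splice_part.split(",")]
        layers.append((splice, recurrence))
    return layers

spec = "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
for i, (splice, recurrence) in enumerate(parse_splice_indexes(spec)):
    print("layer %d: splice %s recurrence %s" % (i, splice, recurrence))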
+ +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
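A practical detail implied by these splicing setups: the acoustic context the network needs grows layer by layer, since the per-layer offsets compose. Assuming the contexts simply add up (which is how plain TDNN splicing behaves), the non-recurrent string above needs about 18 frames of left context and 13 of right; the recurrent variants need additional left context at decode time, which is presumably why the decode stages in these scripts pass --extra-left-context 20. A small sketch of that bookkeeping (my own helper, not one of the steps/ scripts):

# Total left/right context of a feed-forward splice setup, assuming per-layer
# contexts add up; recurrence offsets (after ':') are ignored here.
def total_context(splice_indexes):
    left = right = 0
    for field in splice_indexes.split():
        offsets = [int(x) for x in field.split(":", 1)[0].split(",")]
        left += -min(offsets)
        right += max(offsets)
    return left, right

print(total_context("-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"))  # (18, 13)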
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3o.sh b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh new file mode 100755 index 00000000000..14383fe1a32 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh @@ -0,0 +1,309 @@ +#!/bin/bash + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. +# [ seemed helpful based on likelihoods on first iterations]: on iter 42, +# train prob is -0.1554->-0.1523, and valid prob is -0.1559->-0.1540. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. 
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3p.sh b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh new file mode 100755 index 00000000000..ddba7e7f9c5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# Comparing the WER with 2y, it's about 1% abs worse [see below]. However, this is +# for an odd reason: the model, while smaller than the 2y one (8.8 vs. 12.1 million +# parameters), seems to have a lot more learning capacity, with better train and worse valid +# prob. In 3r and 3s I am trying smaller versions of this architecture. 
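The "learning capacity" argument above boils down to the gap between the final train and valid probabilities: a model with fewer parameters but a larger train-minus-valid gap is fitting the training data harder and generalizing worse. A trivial helper to make that explicit, using placeholder numbers rather than the exact figures quoted below:

# Train/valid chain log-prob gap as a rough over-fitting indicator.
# The numbers below are placeholders, not results from these runs.
def train_valid_gap(train_logprob, valid_logprob):
    return train_logprob - valid_logprob   # larger gap => more over-fitting

print("model A gap:", round(train_valid_gap(-0.080, -0.120), 3))
print("model B gap:", round(train_valid_gap(-0.077, -0.127), 3))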
+ +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# 2y 3p +# final-train-prob: -0.083068 -0.0771 +# final-valid-prob: -0.01212 -0.12715 +# num-parameters: 12094115 8804087 + + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, 
more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3p # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3q.sh b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh new file mode 100755 index 00000000000..9f67164b806 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh @@ -0,0 +1,315 @@ +#!/bin/bash + +# _3q is as _3p, but now trying out the 'block' training script, where in addition to +# the affine connections we have block-matrix connections between the layers. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
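+
+# [Illustration only, not part of the original recipe: the %WER lines quoted in
+#  these headers are the best-scoring entries from the decode directories, and
+#  can be reproduced with the standard helpers, along the lines of
+#    grep Sum exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_*/eval2000_hires.ctm.filt.sys | utils/best_wer.sh
+#    grep WER exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh
+#  (show_wer.sh, used above, is presumably a local wrapper for queries of this kind).]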
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
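+
+# [Illustration only, not part of the original recipe: a rough sketch of the
+#  egs edge weighting described under _p above (this particular setup disables
+#  it again via --apply-deriv-weights false).  Ten frames at each edge get a
+#  derivative weight of zero, and the weight then ramps linearly up to 1.0 over
+#  the next ten frames; a toy loop that prints such a ramp:
+#    for t in $(seq 0 29); do
+#      if   [ $t -lt 10 ]; then w=0
+#      elif [ $t -lt 20 ]; then w=$(echo "scale=2; ($t - 9) / 10" | bc)
+#      else                     w=1.0
+#      fi
+#      echo "frame $t: deriv-weight $w"
+#    done ]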
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
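+
+  # (Illustrative note, not part of the original recipe: the brace pattern above
+  #  expands to one path per disk; something like
+  #    echo /export/b0{5,6,7,8}/$USER/kaldi-data/egs
+  #  prints the four /export/b05 ... /export/b08 variants, and
+  #  utils/create_split_dir.pl then spreads $dir/egs/storage across them so the
+  #  dumped egs do not pile up on a single filesystem.)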
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-block-opts "--jesus-full-output-dim 900 --jesus-full-input-dim 900 --jesus-block-input-dim 900 --jesus-block-output-dim 900 --jesus-hidden-dim 15000 --jesus-final-output-dim 600 --jesus-stddev-scale 0.4 --num-affine-blocks 25 --final-layer-target-rms 0.5" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,0,3 -6,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3r.sh b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh new file mode 100755 index 00000000000..7815adffb9f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] +# [I think I abandoned this after deciding to reduce the parameters even further, +# to the setup in 3s]. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. 
+ +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
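+
+  # (Usage illustration, not part of the original recipe: the variables in the
+  #  configuration section above are exposed as command-line flags through
+  #  utils/parse_options.sh, so an interrupted run can be resumed, e.g.
+  #    local/chain/run_tdnn_3r.sh --stage 12 --train-stage 100
+  #  re-enters this block and has steps/nnet3/chain/train_tdnn.sh pick up again
+  #  from training iteration 100 rather than from the start.)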
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3s.sh b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh new file mode 100755 index 00000000000..6cee8b11925 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3t.sh b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh new file mode 100755 index 00000000000..25e30900e36 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh @@ -0,0 +1,336 @@ +#!/bin/bash + +# _3t is as _3s but using slightly wider context. Dumping our own egs. +# The final train prob is better -0.0851->-0.0815, but valid prob is worse -0.1231->-0.1243. +# WER is slightly worse. So we won't use this for now, but later if we use more data we +# could try wider context like this. 
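+
+# [Illustration only, not part of the original recipe: the "wider context" shows
+#  up in the --splice-indexes strings.  Summing the extreme offsets per layer,
+#  3s ("-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3") sees roughly
+#  2+1+3+6+6 = 18 frames of left context and 2+2+3+3+3 = 13 frames of right
+#  context, while 3t ("-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6")
+#  sees roughly 20 frames on each side, which is why this script dumps its own
+#  egs instead of re-using the 2y egs.]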
+#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# +#%WER 18.01 [ 8860 / 49204, 1043 ins, 2315 del, 5502 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.68 [ 8205 / 49204, 930 ins, 2420 del, 4855 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.7 | 4459 42989 | 82.6 11.9 5.5 2.3 19.7 57.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.2 10.4 5.4 2.0 17.8 55.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3u.sh b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh new file mode 100755 index 00000000000..d1b93d9084c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh @@ -0,0 +1,330 @@ +#!/bin/bash + +# _3u is as _3s (and re-using the egs) but with one more layer; keeping the same dim +# and total context, and reducing --jesus-forward-output-dim from 1500 to 1300 to +# ensure that the number of parameters doesn't increase too much. +# [stopping this run, as the likelihoods weren't promising, e.g. by iteration +# 39, the valid-prob was worse vs. 3t, -0.1488 -> -0.1521 (train: -0.1510 -> -0.1532) + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] 
+ +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. 
+ +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
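+
+  # Illustrative only (not part of the original run): the total acoustic context
+  # of the network is just the sum of the per-layer offsets in the
+  # --splice-indexes string used below, which here comes to 20 frames on each side.
+  splice_indexes="-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0,3,6"
+  left=0; right=0
+  for group in $splice_indexes; do
+    min=$(echo "$group" | tr ',' '\n' | sort -n | head -n1)
+    max=$(echo "$group" | tr ',' '\n' | sort -n | tail -n1)
+    left=$((left - min)); right=$((right + max))
+  done
+  echo "total model context from splice-indexes: $left frames left, $right frames right"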
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3v.sh b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh new file mode 100755 index 00000000000..c7fcb7e24f5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +# _3v is as _3t but decreasing the --num-jesus-blocks from 100 to 50. +# I stopped it early after likelihoods were not promising: +# on iter 90, train prob was -0.1226->-0.1240, valid -0.1304->-0.1340. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. 
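+# (In the --splice-indexes notation each space-separated group is one layer's
+# set of spliced frame offsets, so "-1,2" feeds that layer only frames t-1 and
+# t+2 from the layer below, while "-1,0,1,2" also gives it t and t+1.)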
+ +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
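+
+  # Note (illustrative): --num-jesus-blocks, lowered to 50 for this run, is the
+  # number of block-diagonal groups the Jesus-layer matrices are split into, so
+  # the per-block dims are the forward/hidden dims divided by the block count.
+  # Assuming those dims have to stay divisible by it, a quick sanity check for
+  # the values used below:
+  num_jesus_blocks=50
+  for d in 400 1500 15000; do
+    [ $((d % num_jesus_blocks)) -eq 0 ] || echo "warning: dim $d is not a multiple of $num_jesus_blocks"
+  done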
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --num-jesus-blocks 50 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3w.sh b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh new file mode 100755 index 00000000000..e4165e54de6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh @@ -0,0 +1,332 @@ +#!/bin/bash + +# _3w is as _3t but instead of having a rectangular affine component in each +# layer, making it square (700->600 not 1300->400), and introducing a new script +# option --final-hidden-dim to have something like a bottleneck at the last +# layer, to avoid a blowup in parameters. +# (note: num-params was slightly smaller, 4.8 million vs 5.3 +# I stopped this on iter 65 after likelihoods were not promising: +# on iter 63, train -0.133->-0.138, valid -0.138->-0.141. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. 
+# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. 
+# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. 
give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
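+
+  # Note (illustrative): re-using the 3t egs via --egs-dir below only works
+  # because this model needs no more left/right context than those egs were
+  # dumped with; assuming the standard egs info files are present, the dumped
+  # context can be printed like this:
+  if [ -f exp/chain/tdnn_3t_sp/egs/info/left_context ]; then
+    echo "re-used egs: left context $(cat exp/chain/tdnn_3t_sp/egs/info/left_context)," \
+      "right context $(cat exp/chain/tdnn_3t_sp/egs/info/right_context)"
+  fi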
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 800 --final-hidden-dim 400 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3x.sh b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh new file mode 100755 index 00000000000..1585d209a93 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh @@ -0,0 +1,341 @@ +#!/bin/bash + +# _3x is as _3s (and continuing the same kind of experimentation as in 3t->3w)... +# increasing --jesus-forward-output-dim from 1500 to 2000. +# More overtraining: final-train -0.0852->-0.0799, final-valid -0.1231->-0.1261, +# WER effect is very tiny but maybe slightly better. 
+#a03:s5c: ./show_wer.sh 3x +#%WER 17.78 [ 8750 / 49204, 910 ins, 2405 del, 5435 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 16.60 [ 8166 / 49204, 921 ins, 2290 del, 4955 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.5 | 4459 42989 | 82.7 11.4 5.9 2.2 19.5 57.5 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.7 | 4459 42989 | 84.3 10.3 5.5 1.9 17.7 54.6 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
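+
+  # Note (illustrative): the train/valid "prob" figures quoted in the header
+  # comparisons come from the per-iteration diagnostic jobs; assuming the usual
+  # nnet3 log layout, the final numbers can be pulled out with something like
+  # the following (left commented out, since these logs only exist once training
+  # has finished):
+  #   grep Overall $dir/log/compute_prob_train.final.log
+  #   grep Overall $dir/log/compute_prob_valid.final.log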
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 2000 --final-hidden-dim 350 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3y.sh b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh new file mode 100755 index 00000000000..042ec84898b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _3y is as _3s but doubling jesus-hidden-dim from 15000 to 30000. +# not promising: by iteration 228, train prob changed -0.09583->-0.09575, and +# valid prob from -0.1213 -> -0.1239. Killed it. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 3s. 
+ +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# it is 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building; I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
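+
+# To make the _p note above concrete: with a 30-frame overlap, each edge of an
+# eg gets 10 frames of zero derivative weight followed by a ramp up to 1.0 over
+# the next 10 frames.  Assuming a linear ramp, purely for illustration (the
+# exact shape is whatever the egs-dumping code writes):
+#   awk 'BEGIN{for(i=0;i<30;i++){w=(i<10)?0:(i<20)?(i-9)/10.0:1; printf("frame %2d weight %.1f\n",i,w)}}'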
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
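+
+  # The storage links above just spread the (large) egs over several
+  # /export/b0{5,6,7,8} disks on the CLSP grid; elsewhere the block above is a
+  # no-op and the egs are written directly under $dir/egs.  An optional sanity
+  # check, assuming create_split_dir.pl made numbered symlinks under the
+  # storage directory:
+  #   ls -l $dir/egs/storage | head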
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 30000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3z.sh b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh new file mode 100755 index 00000000000..f1fa2c5a45e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh @@ -0,0 +1,350 @@ +#!/bin/bash + +# _3z is as _3s, but reducing the target num-states in the tree building from 9k to 6k. +# A slight degradation in WER, but it's not 100% consistent. The final train-prob +# was worse -0.0852 -> -0.0888, and valid-prob was worse -0.1231->-0.1280. +#./show_wer.sh 3z +#%WER 18.05 [ 8883 / 49204, 990 ins, 2397 del, 5496 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.50 [ 8120 / 49204, 960 ins, 2234 del, 4926 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.5 11.9 5.5 2.2 19.7 57.6 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 1.9 17.8 55.1 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
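+
+# Notation reminder for --splice-indexes, as used in the _3f/_3g notes above:
+# each space-separated group configures one layer, and the comma-separated
+# numbers are the frame offsets spliced together at that layer.  A trailing
+# ":-3", as in
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# adds a recurrent (left-context) connection at that layer, which is what turns
+# the plain TDNN into the TDNN/RNN hybrid described in _3f.  (This is a reading
+# of the notes above, not a separate specification of the config script.)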
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
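+
+# Re the _2c note above about checking that graphs are stochastic: once stage 13
+# below has built the graph, a quick check is
+#   fstisstochastic $dir/graph_sw1_tg/HCLG.fst
+# which prints the min/max deviation from stochasticity (ideally both close to
+# zero).  Just an aside; nothing in this script depends on it.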
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
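+
+  # Optional: once stage 11 above has built the tree, the actual number of
+  # leaves can be compared against the 6000 requested there; tree-info prints
+  # num-pdfs among other fields:
+  #   tree-info $treedir/tree | grep num-pdfs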
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4a.sh b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh new file mode 100755 index 00000000000..c02ad2cb0e4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh @@ -0,0 +1,349 @@ +#!/bin/bash + +# _4a is as _3s, but using narrower splice-indexes in the first layer. +# WER is maybe a fraction worse than 3s (see below); final train prob is +# worse -0->0852 -> -0.0879, and valid prob is better -0.121 ->-0.1213 +#./show_wer.sh 4a +#%WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
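+
+# The train-prob / valid-prob numbers quoted in these notes (e.g. in the _4a
+# header and in the 2o/2y table above) can be read back out of the training
+# logs.  A sketch, assuming the usual log naming of these train_tdnn.sh runs
+# and that the last compute_prob_{train,valid} logs correspond to the final
+# model:
+#   for d in exp/chain/tdnn_3s_sp exp/chain/tdnn_4a_sp; do
+#     for t in train valid; do
+#       last=$(ls $d/log/compute_prob_${t}.*.log 2>/dev/null | sort -V | tail -n1)
+#       [ -n "$last" ] && echo -n "$d $t: " && grep -h 'Overall log-prob' $last | tail -n1
+#     done
+#   done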
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
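+
+  # Reminder (see the _4a note at the top of this file): relative to 3s the
+  # change here is the first --splice-indexes group in the command below,
+  # "-1,0,1" instead of the "-2,-1,0,1,2" used by 3s/3z, i.e. a narrower input
+  # context at the first layer.  The 2y egs are re-used via --egs-dir.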
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4b.sh b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh new file mode 100755 index 00000000000..aad278c3037 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4b is as _4a, but even narrower splice-indexes in 1st layer (no splicing) +# stopped early after train and valid likelihoods were not promising. +# [later accidentally overwrote and moved the dir.] + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
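+
+  # Reminder (see the _4b note at the top of this file): the first
+  # --splice-indexes group in the command below is just "0", i.e. no temporal
+  # splicing at all at the first layer; this run was stopped early because the
+  # train and valid likelihoods were not promising.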
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "0 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4c.sh b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh new file mode 100755 index 00000000000..d9060251844 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh @@ -0,0 +1,357 @@ +#!/bin/bash + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. +# Yay-- WER is slightly better or the same. Final train-prob is worse +# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241. 
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
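+
+# The parameter counts quoted above ("5.4 million, vs. 12.1 million in 2y, and
+# 8.8 million in 3p") can be re-checked from the trained models.  A sketch,
+# assuming the final.mdl files are still around and that nnet3-am-info reports
+# a num-parameters field:
+#   for s in 3s 3p 2y; do
+#     echo -n "tdnn_${s}_sp: "
+#     nnet3-am-info exp/chain/tdnn_${s}_sp/final.mdl 2>/dev/null | grep num-parameters
+#   done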
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
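+
+  # Note: the --egs-dir option below re-uses the examples already dumped by the
+  # tdnn_2y run (see the _3d note above); this assumes the egs-related settings
+  # (frames-per-eg, contexts, etc.) still match that run.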
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4d.sh b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh new file mode 100755 index 00000000000..1ae220dc21a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4d is as _4a, but with --egs-opts "--frames-overlap-per-eg 10 +# --cut-zero-frames 5" and changing apply-deriv-weights to true... this to +# activate the new-style derivative weights. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the option --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames (see the sketch below).
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same.  On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
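+
+# (Sketch of the deriv-weight profile described in the _p note above, for a
+# 150-frame eg with --frames-overlap-per-eg 30; this only illustrates the text,
+# it is not taken from the actual code:)
+#   seq 0 149 | awk '{t=$1; if (t<10||t>=140) w=0; else if (t<20) w=(t-9)/10; else if (t>=130) w=(140-t)/10; else w=1; print t, w}'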
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights true \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 10 --cut-zero-frames 5" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4e.sh b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh
new file mode 100755
index 00000000000..fea5495ee06
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh
@@ -0,0 +1,362 @@
+#!/bin/bash
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+# A big improvement: about 0.7% WER abs.  Considering the non-l2 part of the objf, the
+# final valid objf c->e is -0.1241->-0.1266 [and the l2 term is -0.0196],
+# and for the training set it's -0.08820 -> -0.1149.
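+
+# (Reading the numbers above at face value: if the full objf is the non-l2 part
+# plus the l2 term, the combined final valid objf for 4e would be roughly
+# -0.1266 + (-0.0196) = -0.1462, i.e. worse than 4c's -0.1241 on the combined
+# objective even though WER improves, which is why the comparison above is done
+# on the non-l2 part only.)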
+ + +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4c +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. 
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. 
It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. 
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.0001 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4f.sh b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh new file mode 100755 index 00000000000..36d5f188c56 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh @@ -0,0 +1,366 @@ +#!/bin/bash + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4g.sh b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh new file mode 100755 index 00000000000..430c6c28c70 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh @@ -0,0 +1,365 @@ +#!/bin/bash + +# _4g is as _4c, but reducing the --jesus-hidden-dim further from 7500 to 4000. +# Strangely, the trend from 4a->4c does not continue: instead of continuing to get worse, +# the train and valid probs both get better. + +# 4a 4c 4g +# Final train prob: -0.0879 -0.08820 -0.08784 +# Final valid prob: -0.1214 -0.1241 -0.1204 + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. +# Yay-- WER is slightly better or the same. Final train-prob is worse +# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241. 
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
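+# [Not from the original log: the parameter counts quoted in the _3s note above
+# (5.4 million / 8.8 million / 12.1 million) could be re-checked from the trained
+# models, assuming they are still on disk; something along these lines should print
+# them (the tool and output field name are from memory, so treat this as a sketch):
+#   for d in tdnn_2y_sp tdnn_3p_sp tdnn_3s_sp; do
+#     nnet3-am-info exp/chain/$d/final.mdl | grep num-parameters
+#   done
+# ]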
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
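+  # The --egs-dir option below points at the egs dumped for the earlier 2y run,
+  # so this experiment re-uses those examples instead of generating new ones.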
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 4000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4n.sh b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh new file mode 100644 index 00000000000..9125d4e7967 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh @@ -0,0 +1,386 @@ +#!/bin/bash + +# _4n is as _4f, but adding the [new] option --convert-repeated-to-block-iter=100. +# reusing iter 100 of model 4f to avoid some iterations of training [did this by +# doing (cd exp/chain; cp -r tdnn_4f_sp tdnn_4n_sp), and then running this script with +# --iter 100]. +# [note: to get the block-affine stuff to train fast enough to make a difference +# I multiplied a factor of sqrt(num-blocks) into the learning-rate factor in +# the code. That change is not committed.] +# +# Essentially no effect on WER, but train and valid probs are worse. +# ./compare_wer.sh 4f 4n +# System 4f 4n +# WER on train_dev(tg) 16.83 16.84 +# WER on train_dev(fg) 15.73 15.69 +# WER on eval2000(tg) 18.4 18.4 +# WER on eval2000(fg) 16.6 16.6 +# Final train prob -0.105832 -0.111309 +# Final valid prob -0.123021 -0.123601 + + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
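+# [Illustrative aside, not from the original notes: one way to spot-check the claim in
+# the _2c note above that the graphs are stochastic is Kaldi's fstisstochastic tool,
+# e.g. something like
+#   fstisstochastic exp/chain/tdnn_4n_sp/graph_sw1_tg/HCLG.fst
+# which reports how far the FST's arc weights deviate from summing to one; values near
+# zero mean the graph is (close to) stochastic.]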
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --convert-repeated-to-block-iter 100 \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4p.sh b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh new file mode 100755 index 00000000000..d2b073cdc77 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh @@ -0,0 +1,381 @@ +#!/bin/bash + +# _4p is as _4f, but one fewer layer, and making the final-layer context wider to +# compensate; also increasing the jesus-layer input and output dims 400->500 and 1500->1600 to +# somewhat compensate for the reduction in parameters. + +# definitely worse. Later with 4r I go in the opposite direction by adding a new layer, +# and get a small improvement. +# ./compare_wer.sh 4f 4p +# System 4f 4p +# WER on train_dev(tg) 16.83 17.36 +# WER on train_dev(fg) 15.73 16.10 +# WER on eval2000(tg) 18.4 19.1 +# WER on eval2000(fg) 16.6 17.2 +# Final train prob -0.105832 -0.104439 +# Final valid prob -0.123021 -0.125576 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
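+# [Illustrative aside, not from the original notes: with TDNN splicing, the total
+# acoustic context of the network is roughly the sum over layers of the most negative
+# and most positive splice offsets.  For the splice-indexes used further below in this
+# script, "-1,0,1 -1,0,1,2 -6,-3,0,3 -9,-6,-3,0,3,6", that works out to about
+# 1+1+6+9 = 17 frames of left context and 1+2+3+6 = 12 frames of right context,
+# before frame subsampling is taken into account.]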
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 450 --jesus-forward-output-dim 1600 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -6,-3,0,3 -9,-6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh new file mode 100755 index 00000000000..9f2534f4f22 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is based on Dan's tdnn_2o script +# it has a different splicing configuration +# it uses the PerDimensionWeightedAverage pooling in place of the Jesus layer + +set -e + +#%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 15.59 [ 7671 / 49204, 883 ins, 2234 del, 4554 sub ] exp/chain/tdnn_v1_trial6_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 + + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4q # Note: _sp will get added to this if $speed_perturb == true. 
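+# decode_iter, if set to a training iteration number, makes the decoding stage below
+# use that iteration's model (it is passed to the decode script as --iter) instead of
+# the final model.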
+decode_iter= + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window=7 +pool_type='per-dim-weighted-average' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4r.sh b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh new file mode 100755 index 00000000000..64831b5802a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
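[Annotation, not part of the patch: the --splice-indexes strings discussed above give one group of frame offsets per layer, and the network's total temporal context is roughly the sum over layers of the most negative / most positive offset in each group. A small self-contained sketch of that bookkeeping:]

splice_indexes="-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
left=0; right=0
for group in $splice_indexes; do
  group=${group%%:*}    # drop any recurrence part such as ":-3"
  min=$(echo "$group" | tr ',' '\n' | sort -n | head -n1)
  max=$(echo "$group" | tr ',' '\n' | sort -n | tail -n1)
  left=$((left - min)); right=$((right + max))
done
echo "total model context: $left frames to the left, $right to the right"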
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same.  On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
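[Annotation, not part of the patch: the %WER figures quoted in these header comments are collected from the decode directories by small helpers such as show_wer.sh / compare_wer.sh, which are not included here; a rough equivalent using utils/best_wer.sh from the standard recipe would be:]

for d in exp/chain/tdnn_4f_sp/decode_*; do
  [ -d "$d" ] || continue
  grep WER "$d"/wer_* 2>/dev/null | utils/best_wer.sh                    # train_dev-style scoring
  grep Sum "$d"/score_*/*.ctm.filt.sys 2>/dev/null | utils/best_wer.sh   # eval2000 sclite scoring
done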
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4r  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
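  # [Annotation, not part of the patch.] utils/create_split_dir.pl above turns
  # $dir/egs/storage into a set of symlinked subdirectories spread over the listed
  # /export/b0{5,6,7,8} disks, so the large egs archives are distributed rather than
  # written to a single filesystem; the .nodelete marker is, as far as I understand
  # the convention, checked by cleanup scripts so that egs re-used by other runs
  # are not deleted.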
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4s.sh b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh new file mode 100755 index 00000000000..92a1a7da277 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option- +#currently in a branch] +# Overall no real change. + +# ./compare_wer.sh 4f 4s +# System 4f 4s +# WER on train_dev(tg) 16.83 16.82 +# WER on train_dev(fg) 15.73 15.62 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.111371 +# Final valid prob -0.123021 -0.12648 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
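[Annotation, not part of the patch: the --lm-opts values discussed above are forwarded to the phone language-model estimation inside the chain training script; schematically the denominator LM is built from the phone sequences of the training alignments, roughly as below. The paths and the exact pipeline are assumptions; only the --num-extra-lm-states option itself comes from the script.]

ali-to-phones $ali_dir/final.mdl "ark:gunzip -c $ali_dir/ali.*.gz|" ark:- | \
  chain-est-phone-lm --num-extra-lm-states=2000 ark:- $dir/phone_lm.fst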
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.02 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4t.sh b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh new file mode 100755 index 00000000000..30b383d05d7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh @@ -0,0 +1,382 @@ +#!/bin/bash + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# [note, I accidentally overwrote this directory afterwards, and moved it.] +# It's really not clear whether it's helpful. +# ./compare_wer.sh 4f 4t +# System 4f 4t +# WER on train_dev(tg) 16.83 16.75 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.112721 +# Final valid prob -0.123021 -0.129688 + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.08 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4u.sh b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh new file mode 100755 index 00000000000..ae7cf02b426 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh @@ -0,0 +1,384 @@ +#!/bin/bash + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +# It seems a bit better on average. +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
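+
+# (Side note, a sketch grounded in the script body further down rather than in
+# the '5o' setup: in these run_tdnn_*.sh scripts the tree-building is guarded by
+#   if [ $stage -le 11 ]; then ... steps/nnet3/chain/build_tree.sh ... ; fi
+# so re-using an already-built tree just means leaving --stage at its default
+# of 12 (or passing --stage 12 on the command line), with $treedir pointing at
+# the existing tree directory.)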
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.08 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh new file mode 100755 index 00000000000..9cdbfefb5a2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh @@ -0,0 +1,394 @@ +#!/bin/bash + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +#./compare_wer.sh 4r 4v +#System 4r 4v +#WER on train_dev(tg) 16.50 15.95 +#WER on train_dev(fg) 15.45 14.69 +#WER on eval2000(tg) 18.3 17.7 +#WER on eval2000(fg) 16.7 16.0 +#Final train prob -0.103652 -0.106646 -1.60775 +#Final valid prob -0.121105 -0.118631 -1.62832 + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
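+
+# (For reference, the script below keeps this setting: the config section sets
+# leftmost_questions_truncate=-1 and the tree-building stage passes it through
+# unchanged, i.e. effectively
+#   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+#     --leftmost-questions-truncate -1 --cmd "$train_cmd" 9000 ...
+# so nothing extra is needed to keep the mechanism disabled.)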
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  The only option we supply is --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively treats the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames (sketched just after the _m note below).
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same.  On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
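+
+# (A rough illustration of the _p edge handling above: with the frames-overlap
+# of 30, the derivative weights at each end of an eg would look something like
+#   0 0 0 0 0 0 0 0 0 0   0.1 0.2 ... 0.9 1.0   1.0 1.0 ...
+# i.e. 10 zero-weight frames, a ramp up to 1.0 over the next 10 frames, then
+# full weight for the interior; the exact shape of the ramp is an assumption
+# here, the weights are simply dumped together with the egs.)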
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh new file mode 100755 index 00000000000..6dd5c587f7a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh @@ -0,0 +1,397 @@ +#!/bin/bash + +# _4w is as _4v, but doubling --xent-regularize to 0.2 WER seems consistently a +# bit worse, although final valid prob is very slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh new file mode 100755 index 00000000000..0290e0bdbd5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +# _4x is as _4u, but with --leaky-hmm-coefficient 0.2. Note: the +# ultimate baseline is 4f. It seems a little bit worse than 4u on average: (+0.2, +0.2, 0.0, -0.1). +# So I'm guessing the best value is around --leaky-hmm-coefficient 0.1. +# +# ./compare_wer.sh 4f 4u 4x +# System 4f 4u 4x +# WER on train_dev(tg) 16.83 16.47 16.63 +# WER on train_dev(fg) 15.73 15.23 15.42 +# WER on eval2000(tg) 18.4 18.4 18.4 +# WER on eval2000(fg) 16.6 16.7 16.6 +# Final train prob -0.105832 -0.118911 -0.130674 +# Final valid prob -0.123021 -0.135768 -0.146351 + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. 
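+
+# (A minimal sketch, not one of the runs recorded here: 4s/4t/4u/4x together
+# form a sweep over --leaky-hmm-coefficient in {0.02, 0.04, 0.08, 0.2}.
+# Assuming the 4s and 4u scripts exist in this patch alongside 4t and 4x, and
+# that each hard-codes its own coefficient and output directory as these do,
+# the sweep amounts to running them in turn, e.g.
+#   for v in s t u x; do local/chain/run_tdnn_4${v}.sh; done
+# and then comparing with ./compare_wer.sh as above.)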
+ +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh new file mode 100755 index 00000000000..cd1de07a80d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh @@ -0,0 +1,401 @@ +#!/bin/bash + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. Very helpful (between 0.2% +# and 0.6%). + +#./compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
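A note on the --xent-regularize values being swept in the _4v/_4w experiments above
(0.1 vs. 0.2): as I understand the chain setup, the network gets a second output layer
trained with cross-entropy, and the objective is roughly

  total_objf = chain_objf + xent_regularize * xent_objf

so these runs are only varying the weight on that auxiliary cross-entropy term; the
xent output acts as a regularizer during training and is not used at decode time.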
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
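The learning-rate options in the config section further down (initial_effective_lrate,
final_effective_lrate, num_jobs_initial/final) are coupled: the 'effective' rate is the
rate after averaging the parallel jobs' models, so the per-job rate scales with the
current number of jobs, and the rate decays from the initial to the final value over
training. A rough sketch of what that schedule looks like, assuming exponential
interpolation and a linear ramp of the job count (both assumptions here; the exact
schedule is whatever steps/nnet3/chain/train_tdnn.sh implements, and niter below is
just an illustrative iteration count):

  awk -v lr0=0.001 -v lr1=0.0001 -v j0=3 -v j1=16 -v niter=100 'BEGIN {
    for (i = 0; i <= niter; i += 20) {
      frac = i / niter
      eff  = lr0 * exp(frac * log(lr1 / lr0))    # exponential interpolation
      jobs = int(j0 + frac * (j1 - j0) + 0.5)    # job count ramps up during training
      printf("iter %3d  num-jobs %2d  effective-lrate %.6f  per-job lrate %.6f\n",
             i, jobs, eff, eff * jobs)
    }
  }'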
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh new file mode 100755 index 00000000000..7e44c10920e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh @@ -0,0 +1,404 @@ +#!/bin/bash + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
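On the --leaky-hmm-coefficient option introduced at the top of this file (0.1 here; the
earlier tdnn_4x run used 0.2): as I understand it, during the denominator
forward-backward a small amount of probability mass is allowed to leak between HMM
states in proportion to their initial probabilities, something like

  alpha'(s) = alpha(s) + leaky_hmm_coefficient * init_prob(s) * sum_s' alpha(s')

which smooths the denominator computation; per the comparison table above it buys
between 0 and 0.35% absolute.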
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh new file mode 100755 index 00000000000..93ebb59b16d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# _5c is as _4w, but changing --xent-regularize to 0.05, since 0.2 seemed to be +# worse than 0.1. +# It seems a little worse on average: WER change is (+0.3, +0.3, -0.2, +0.2). +#System 4w 5c +#WER on train_dev(tg) 16.05 16.35 +#WER on train_dev(fg) 14.92 15.21 +#WER on eval2000(tg) 18.0 17.8 +#WER on eval2000(fg) 16.2 16.4 +#Final train prob -0.108816 -0.107098 +#Final valid prob -0.118254 -0.118209 + +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently +# a bit worse (+0.1, +0.2, +0.3, +0.2), although final valid prob is very +# slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
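For reading the '%WER ... | 4459 42989 | ...' lines quoted throughout these notes: they
are NIST sclite summary (.sys) lines, the two counts are #segments and #words, and the
six numbers in the middle block are %Corr %Sub %Del %Ins %WER %SER (sentence error). A
tiny helper to pull one apart, using a line from above as the example:

  echo '%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys' | \
    awk -F'|' '{ split($3, f, " ");
                 printf("corr=%s sub=%s del=%s ins=%s wer=%s ser=%s\n",
                        f[1], f[2], f[3], f[4], f[5], f[6]) }'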
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
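A quick sanity check on the example/minibatch sizes used by these runs (the _i note just
below, and frames_per_eg / minibatch_size in the config section further down): assuming
the usual 10 ms frame shift and that --frames-per-eg counts input frames, each example
covers about 1.5 seconds of speech and each minibatch about 3.2 minutes:

  awk -v frames_per_eg=150 -v minibatch=128 -v frame_shift=0.01 'BEGIN {
    printf("seconds per eg:        %.1f\n", frames_per_eg * frame_shift)
    printf("seconds per minibatch: %.1f\n", frames_per_eg * minibatch * frame_shift)
  }'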
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.05 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh new file mode 100755 index 00000000000..8e6e9358003 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh @@ -0,0 +1,407 @@ +#!/bin/bash + +# _5d is as _5b, but increasing jesus-forward-input-dim from 500 to 600 and +# jesus-forward-output-dim from 1800 to 2000. + +# It's maybe slightly helpful: WER change is (-0.2, -0.2, 0, +0.1). +#./compare_wer.sh 5b 5d +#System 5b 5d +#WER on train_dev(tg) 15.51 15.29 +#WER on train_dev(fg) 14.39 14.17 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.7 +#Final train prob -0.112013 -0.107858 +#Final valid prob -0.130879 -0.128862 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
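+
+# (In this script the mechanism referred to above is controlled by the
+#  leftmost_questions_truncate variable in the config section below, which is
+#  passed to steps/nnet3/chain/build_tree.sh as --leftmost-questions-truncate;
+#  the value -1 disables it.)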
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# The --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# This is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
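+
+# (The utils/mkgraph.sh --self-loop-scale 1.0 call in stage 13 below is the
+#  testing-time side of the _2c change described above, i.e. scales of 1.0
+#  rather than 0.)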
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
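+
+  # The --xent-regularize 0.2 and --leaky-hmm-coefficient 0.1 options in the
+  # command below are the settings inherited from the _4w and _5b changes
+  # described in the comments at the top of this script.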
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh new file mode 100755 index 00000000000..ed48b0673b8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh @@ -0,0 +1,417 @@ +#!/bin/bash + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
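+
+# (Roughly speaking, --xent-regularize trains an auxiliary cross-entropy
+#  ('xent') output alongside the chain output, with the given value as the
+#  weight on that auxiliary objective; the 'final xent layer' mentioned in the
+#  _4v note below refers to that output.)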
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# The --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# This is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
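+
+# (The --lm-opts "--num-extra-lm-states=2000" setting in the training command
+#  below is the phone-LM configuration arrived at via the _z, _2a and _2i
+#  experiments described above.)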
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
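+
+  # The --xent-regularize 0.1 in the command below is the 5e change (chosen over
+  # 0.2 and 0.05 based on the 4v/4w/5c results, as described at the top of this
+  # script); --leaky-hmm-coefficient 0.1 is the 5b change.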
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh new file mode 100755 index 00000000000..5fb1f0c445c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# _5f is as _5e, but making the 5b->5d change (increasing the +# number of parameters)-- increasing jesus-forward-output-dim from 1800 to 2000, +# and jesus-forward-input-dim from 500 to 600. + +# WER change is (-0.1, -0.2, +0.2, +0.1). So zero on average. +# This means 5e remains the best system so far. + +#./compare_wer.sh 5e 5f +#System 5e 5f +#WER on train_dev(tg) 15.43 15.35 +#WER on train_dev(fg) 14.32 14.15 +#WER on eval2000(tg) 17.3 17.5 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.10574 +#Final valid prob -0.129184 -0.128112 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.05 is better than 0.2 or 0.1). + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# The --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# This is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
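+
+# (The frames_per_eg=150 and minibatch_size=128 defaults in the config section
+#  below match the _i change described in the notes that follow: 150-frame egs
+#  with 128-element minibatches.)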
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
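+
+  # The --jesus-forward-input-dim 600 and --jesus-forward-output-dim 2000 in the
+  # command below implement the 5b->5d parameter increase described at the top
+  # of this script (up from 500 and 1800 in 5e).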
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5g.sh b/egs/swbd/s5c/local/chain/run_tdnn_5g.sh new file mode 100755 index 00000000000..784facf5a82 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5g.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + + +# Here is decoding with --frames-per-chunk 300. A fairly consistent +# improvement. +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.27 +#WER on train_dev(fg) 14.32 14.21 +#WER on eval2000(tg) 17.3 16.9 +#WER on eval2000(fg) 15.5 15.2 +#Final train prob -0.110056 -0.103752 +#Final valid prob -0.129184 -0.125641 + + +# *All results below here are broken-- they were computed when I had a bug in +# the index-permutation, and the blocks weren't computed right for the jesus +# layer.* +# Here are WERs when the frames-per-chunk was 50: +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.62 +#WER on train_dev(fg) 14.32 14.42 +#WER on eval2000(tg) 17.3 17.7 +#WER on eval2000(fg) 15.5 16.0 + +# and here with 150: +# WER on train_dev(tg) 15.43 15.46 +# WER on train_dev(fg) 14.32 14.38 +# WER on eval2000(tg) 17.3 17.3 +# WER on eval2000(fg) 15.5 15.5 + + +# and here with 300 ... we do see a small improvement +# at this value. 
(could probably improve it further +# by modifying the model to average over a larger window). +#WER on train_dev(tg) 15.43 15.29 +#WER on train_dev(fg) 14.32 14.17 +#WER on eval2000(tg) 17.3 17.2 +#WER on eval2000(fg) 15.5 15.4 +#Final train prob -0.110056 -0.105725 +#Final valid prob -0.129184 -0.125756 + +# Below is also with chunk-size=300, but with the 'wide' model +# that sees more context. Oddly, the WER is worse. It looks like +# the model may be doing something different than just learning +# speaker characteristics. +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.54 +#WER on train_dev(fg) 14.32 14.34 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.105725 +#Final valid prob -0.129184 -0.125756 + + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +# if [ $stage -le 15 ]; then +# # get wide-context model +# nnet3-am-copy --binary=false $dir/final.mdl - | \ +# sed 's/Context> 99/Context> 306/g' | nnet3-am-copy - $dir/wide.mdl +# for decode_set in train_dev eval2000; do +# ( +# steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ +# --frames-per-chunk 300 --iter wide \ +# --nj 50 --cmd "$decode_cmd" \ +# --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ +# $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; +# if $has_fisher; then +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ +# data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ +# $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; +# fi +# ) & +# done +# fi + + +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5h.sh b/egs/swbd/s5c/local/chain/run_tdnn_5h.sh new file mode 100755 index 00000000000..5eeb5ca5d03 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5h.sh @@ -0,0 +1,434 @@ +#!/bin/bash + +# _5h is as _5g, but only mean, no stddev, stats. + +# The following comparison is with 150 frames per chunk +# in both the 5g and 5h decodes. No consistent WER difference +# with either 5e or 5g. 
+#System 5e 5g 5h +#WER on train_dev(tg) 15.43 15.46 15.45 +#WER on train_dev(fg) 14.32 14.38 14.34 +#WER on eval2000(tg) 17.3 17.3 17.2 +#WER on eval2000(fg) 15.5 15.5 15.7 +#Final train prob -0.110056 -0.105725 -0.106213 +#Final valid prob -0.129184 -0.125756 -0.126809 + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5i.sh b/egs/swbd/s5c/local/chain/run_tdnn_5i.sh new file mode 100755 index 00000000000..9ffc37793ee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5i.sh @@ -0,0 +1,432 @@ +#!/bin/bash + +# _5i is as _5g, but adding the mean+stddev features for all hidden layers. +# a little worse than 5g (but for Remi Francis it was a little better). +#local/chain/compare_wer.sh 5e 5g 5i +#System 5e 5g 5i +#WER on train_dev(tg) 15.43 15.27 15.41 +#WER on train_dev(fg) 14.32 14.21 14.47 +#WER on eval2000(tg) 17.3 16.9 17.0 +#WER on eval2000(fg) 15.5 15.2 15.4 +#Final train prob -0.110056 -0.103752 -0.102539 +#Final valid prob -0.129184 -0.125641 -0.12375 + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2,mean+stddev(-99:1:9:99) -3,0,3,mean+stddev(-99:3:9:99) -3,0,3,mean+stddev(-99:3:9:99) -3,0,3,mean+stddev(-99:3:9:99) -6,-3,0,mean+stddev(-99:3:9:99)" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5j.sh b/egs/swbd/s5c/local/chain/run_tdnn_5j.sh new file mode 100755 index 00000000000..892a79fd2a8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5j.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
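+
+  # Unlike the 5i run earlier in this patch (and the 5e baseline this script
+  # mirrors), the call below passes no --online-ivector-dir option; that is
+  # the "omitting the iVectors" change described at the top of this script,
+  # so the network sees only the hires MFCC features.  With the plain
+  # splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0", the summed
+  # extreme offsets give a total model context of roughly 17 frames to the
+  # left and 12 to the right.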
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5k.sh b/egs/swbd/s5c/local/chain/run_tdnn_5k.sh new file mode 100755 index 00000000000..b6c984ed253 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5k.sh @@ -0,0 +1,454 @@ +#!/bin/bash + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5k # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
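+
+  # Relative to 5j, the only change to the network spec below is the
+  # mean+stddev(-99:3:9:99) statistics-pooling entry added to the fourth
+  # splice-indexes group (the statistics-extraction layer described at the
+  # top of this script); the egs themselves are re-used from the 5j run via
+  # --egs-dir.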
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5j_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5l.sh b/egs/swbd/s5c/local/chain/run_tdnn_5l.sh new file mode 100755 index 00000000000..d5b51eb7551 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5l.sh @@ -0,0 +1,464 @@ +#!/bin/bash + +# _5l is as _5k, but doubling frames-per-eg from 150 to 300, and increasing +# the context radius of the statistics-pooling from 99 to 153. + +# :-( No better than 5k.) +#./compare_wer.sh 5e 5j 5k 5l +#System 5e 5j 5k 5l +#WER on train_dev(tg) 15.43 17.59 16.46 16.68 +#WER on train_dev(fg) 14.32 16.33 15.17 15.40 +#WER on eval2000(tg) 17.3 19.1 18.1 18.3 +#WER on eval2000(fg) 15.5 17.5 16.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502-0.0804455 +#Final valid prob -0.129184 -0.130761 -0.12337 -0.10712 + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. 
+#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.414 # was 2; now 2 / sqrt(2) = sqrt(2), since we're using half the minibatch size. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --frames-per-eg 300 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-153:3:9:153) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size 64 \ + --egs-opts "--frames-overlap-per-eg 0" \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5m.sh b/egs/swbd/s5c/local/chain/run_tdnn_5m.sh new file mode 100644 index 00000000000..a9e12357c23 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5m.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +# _5m is as _5e, but with a script change where we are randomizing +# the frame shift a bit better. + +# No very clear change, but if anything the optimization is less effective +# and the WER worse -> I'm going to revert this script change. +#System 5e 5m +#WER on train_dev(tg) 15.43 15.57 +#WER on train_dev(fg) 14.32 14.47 +#WER on eval2000(tg) 17.3 17.2 +#WER on eval2000(fg) 15.5 15.7 +#Final train prob -0.110056 -0.112539 +#Final valid prob -0.129184 -0.129006 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
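+
+# [Editorial illustration, not from the original notes:] the egs-edge handling
+# described in the '_p' note above amounts to storing a per-frame weight with each
+# eg and multiplying the derivative w.r.t. the output by it, instead of using
+# --min-deriv-time/--max-deriv-time.  Assuming the ramp is linear (the note only
+# says it ramps up to 1.0 over 10 frames), a 150-frame eg would get weights
+# roughly like:
+#   frames   1-10  : 0.0                    (zero deriv at the overlapped edge)
+#   frames  11-20  : 0.1, 0.2, ..., 1.0     (linear ramp)
+#   frames  21-130 : 1.0                    (full weight in the interior)
+#   frames 131-150 : mirror image of the left edge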
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
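+
+  # [Editorial note, hedged:] utils/create_split_dir.pl above spreads the (large) egs
+  # archives over several filesystems: the brace expansion /export/b0{5,6,7,8}/...
+  # gives four candidate directories (b05, b06, b07, b08), and the script is expected
+  # to populate $dir/egs/storage with numbered subdirectories that are symlinks
+  # cycling through those locations, so no single disk holds all of the examples.
+  # This only happens on the CLSP grid (see the hostname check above); elsewhere the
+  # egs are simply written under $dir/egs.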
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5n.sh b/egs/swbd/s5c/local/chain/run_tdnn_5n.sh new file mode 100755 index 00000000000..d4372a418d8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5n.sh @@ -0,0 +1,459 @@ +#!/bin/bash + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
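+
+  # [Editorial note on the frame-rate bookkeeping, derived from the options used in
+  # this script:] the _dbl features are extracted every 5 ms instead of 10 ms, so
+  # there are twice as many input frames per second.  Hence --frame-subsampling-factor
+  # is 6 (6 x 5 ms = 30 ms between chain outputs, as in the usual 10-ms recipes with
+  # factor 3), while --alignment-subsampling-factor stays at 3 because the tri4
+  # lattices are still at the 10-ms rate.  For the same reason frames_per_eg is
+  # doubled to 300 (about 1.5 s of audio per chunk), --frames-per-iter is doubled to
+  # 2400000, and "0.005" is written to $dir/frame_shift so the scoring scripts know
+  # the frame shift in seconds.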
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5o.sh b/egs/swbd/s5c/local/chain/run_tdnn_5o.sh new file mode 100755 index 00000000000..86bbe1ad441 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5o.sh @@ -0,0 +1,467 @@ +#!/bin/bash + +# _5o is as _5n but adding an extra splicing layer and increasing the +# splice-width slightly on the 1st layer, to get closer to the context in 5n; +# having one more layer running at double-frequency, and reverting the frame-length to +# the same as in the baseline (25ms) to avoid sacrificing frequency resolution. + +# Objective functions improve but WER change is quite small vs 5n (~0.1%). so +# not clear that the extra time is worth it (it's noticeably slower to train as +# that extra layer is at a higher sampling rate). +# +#System 5j 5n 5o +#WER on train_dev(tg) 17.59 16.85 16.83 +#WER on train_dev(fg) 16.33 15.67 15.60 +#WER on eval2000(tg) 19.1 19.1 18.8 +#WER on eval2000(fg) 17.5 17.3 17.2 +#Final train prob -0.114691 -0.116341 -0.111613 +#Final valid prob -0.130761 -0.130884 -0.126765 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
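+
+# [Editorial note, hedged:] as we understand these options, --l2-regularize adds an
+# L2 penalty on the network's chain output to the training objective (scaled by the
+# given constant), and --xent-regularize (see the 4v/4w/5e notes above) adds a
+# separate cross-entropy output layer whose objective is weighted by the given
+# factor; both act only as regularizers during training, and decoding uses the main
+# chain output.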
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl2 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl2.conf \ + data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl2 # remove segments with problems + done +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
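+
+  # [Editorial worked example, based on the --splice-indexes passed below:] the total
+  # acoustic context of the network is the sum of the per-layer offsets:
+  #   left  context: 1 + 2 + 2 + 4 + 6 + 6 + 12 = 33 input frames
+  #   right context: 1 + 2 + 2 + 2 + 6 + 6 + 0  = 19 input frames
+  # At the 5 ms input frame shift used here that is roughly 165 ms of left and 95 ms
+  # of right context per output frame (the 5n setup has 31 and 17 frames respectively).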
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl2 $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires_dbl2 $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl2 \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5p.sh b/egs/swbd/s5c/local/chain/run_tdnn_5p.sh new file mode 100755 index 00000000000..d2ef7057873 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5p.sh @@ -0,0 +1,421 @@ +#!/bin/bash + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. [abandoned after discovering bug, +# this thread is picked up in 5s and 5t.] + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
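+
+# (For context on the option mentioned just above: --leftmost-questions-truncate is a
+# tree-building option -- it is passed through to steps/nnet3/chain/build_tree.sh in the
+# script below -- which, roughly speaking, limits the set of questions that can be asked
+# about the leftmost phone-context position; setting it to -1, as these later experiments
+# do, disables that truncation.)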
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
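+
+# [Aside: the --num-extra-lm-states / --num-extra-states options above control the size of
+# the phone-level denominator LM.  As a rough sketch of how such an LM gets estimated with
+# the current chain tools (the exact commands and paths used inside the training script may
+# differ; this is illustrative only):
+#   ali-to-phones $treedir/final.mdl "ark:gunzip -c $treedir/ali.*.gz|" ark:- | \
+#     chain-est-phone-lm --num-extra-lm-states=2000 ark:- $dir/phone_lm.fst
+# ]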
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
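+
+  # Quick notes on the chain-specific options passed below (summarizing how these options
+  # are generally described for 'chain' training; see the chain code/docs for details):
+  #   --xent-regularize 0.1        adds a separate cross-entropy output and mixes 0.1 times
+  #                                its objective into the LF-MMI objective, as a regularizer;
+  #   --leaky-hmm-coefficient 0.1  lets a small amount of probability "leak" between states
+  #                                of the denominator HMM, smoothing the denominator
+  #                                forward-backward computation;
+  #   --l2-regularize 0.00005      puts an l2 penalty on the network's output values.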
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5q.sh b/egs/swbd/s5c/local/chain/run_tdnn_5q.sh new file mode 100755 index 00000000000..5968a00417e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5q.sh @@ -0,0 +1,425 @@ +#!/bin/bash + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. [abandoned after discovering bug, +# this thread is picked up in 5s and 5t.] + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
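+
+  # Reading guide for the network options below: each space-separated group in
+  # --splice-indexes lists the time offsets spliced together at one layer (so "-1,0,1" means
+  # that layer sees frames t-1, t, t+1), and in some of the experiments above a ":N" suffix
+  # on a group adds recurrence at that offset.  In --jesus-opts, the forward input/output
+  # dims and the hidden dim set the sizes of the block-structured "jesus" layers, which is
+  # why 5q and 5r shrink them to reduce the parameter count.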
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5r.sh b/egs/swbd/s5c/local/chain/run_tdnn_5r.sh new file mode 100755 index 00000000000..306d76859f9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5r.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. +# [abandoned after discovering bug, this thread is picked up in 5s and 5t.] + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
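+
+  # Note on the two lines above: utils/create_split_dir.pl makes $dir/egs/storage a directory
+  # whose numbered subdirectories are (roughly speaking) symlinks spread over the listed
+  # /export/b0{5,6,7,8} disks, so the large egs get distributed across filesystems on the
+  # CLSP grid; the .nodelete file is just a marker so the egs are kept if this run dies.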
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.1 \
+    --leaky-hmm-coefficient 0.1 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 5000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5s.sh b/egs/swbd/s5c/local/chain/run_tdnn_5s.sh
new file mode 100755
index 00000000000..65da1e06183
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5s.sh
@@ -0,0 +1,441 @@
+#!/bin/bash
+
+# Comparing with 5e which is the most recent baseline we actually decoded,
+# 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
+# jesus-hidden-dim reduced from 7500 to 5000, and
+# the new option --self-repair-scale 0.00001 added.
+# Also compare 5t and 5v which have even smaller jesus-hidden-dims.
+
+#./compare_wer.sh 5e 5s 5t
+#System                    5e        5s        5t
+#WER on train_dev(tg)   15.43     15.47     15.43
+#WER on train_dev(fg)   14.32     14.31     14.34
+#WER on eval2000(tg)      17.3      17.4      17.4
+#WER on eval2000(fg)      15.5      15.6      15.6
+#Final train prob     -0.110056 -0.110928 -0.110752
+#Final valid prob     -0.129184 -0.132139 -0.129123
+
+# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
+
+# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
+# to compensate for the fact that more of the output dimensions are now being
+# usefully used.
+
+# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
+# ReLUs that are over or under-saturated.
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+ +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 5000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5t.sh b/egs/swbd/s5c/local/chain/run_tdnn_5t.sh new file mode 100755 index 00000000000..9831417003b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5t.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. Seems to make no difference to WERs; valid prob improves. + +#local/chain/compare_wer.sh 5e 5s 5t +#System 5e 5s 5t +#WER on train_dev(tg) 15.43 15.47 15.43 +#WER on train_dev(fg) 14.32 14.31 14.34 +#WER on eval2000(tg) 17.3 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 +#Final train prob -0.110056 -0.110928 -0.110752 +#Final valid prob -0.129184 -0.132139 -0.129123 + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. 
+ +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. 
+ +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5u.sh b/egs/swbd/s5c/local/chain/run_tdnn_5u.sh new file mode 100755 index 00000000000..34fe30993cf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5u.sh @@ -0,0 +1,505 @@ +#!/bin/bash + +# _5u is as _5o but modifying the mfcc generation to use a narrower window while +# generating the lower-order mfcc coefficients (the first 10). + +# Abandoning it partway through after I got the following less-than-promising diagnostics. +# grep Overall exp/chain/tdnn_5{o,u}_sp/log/compute_prob_valid.84.log | grep -v xent +# exp/chain/tdnn_5o_sp/log/compute_prob_valid.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.146977 + -0.0159528 = -0.16293 per frame, over 20000 frames. +# exp/chain/tdnn_5u_sp/log/compute_prob_valid.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.147207 + -0.015692 = -0.162899 per frame, over 20000 frames. +# a03:s5c: grep Overall exp/chain/tdnn_5{o,u}_sp/log/compute_prob_train.84.log | grep -v xent +# exp/chain/tdnn_5o_sp/log/compute_prob_train.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.146703 + -0.0165036 = -0.163207 per frame, over 20000 frames. 
+# exp/chain/tdnn_5u_sp/log/compute_prob_train.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.145524 + -0.0162272 = -0.161751 per frame, over 20000 frames. + +# _5o is as _5n but adding an extra splicing layer and increasing the +# splice-width slightly on the 1st layer, to get closer to the context in 5n; +# having one more layer running at double-frequency, and reverting the frame-length to +# the same as in the baseline (25ms) to avoid sacrificing frequency resolution. + +# Objective functions improve but WER change is quite small vs 5n (~0.1%). so +# not clear that the extra time is worth it (it's noticeably slower to train as +# that extra layer is at a higher sampling rate). +# +#System 5j 5n 5o +#WER on train_dev(tg) 17.59 16.85 16.83 +#WER on train_dev(fg) 16.33 15.67 15.60 +#WER on eval2000(tg) 19.1 19.1 18.8 +#WER on eval2000(fg) 17.5 17.3 17.2 +#Final train prob -0.114691 -0.116341 -0.111613 +#Final valid prob -0.130761 -0.130884 -0.126765 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=13 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data with normal window size. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl2 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl2.conf \ + data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl2 # remove segments with problems + done +fi + +# Generate double-frame-rate version of the data with smaller than normal window size; +# and only keeping the first 10 MFCC coefficients. 
+if [ $stage -le 13 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_dbl3 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_dbl3.conf \ + data/${dataset}_dbl3 exp/make_dbl3/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_dbl3 # remove segments with problems + done +fi + +# select dimension 10-39 of the dbl2 features, then create pasted features consisting +# of the 10 dimensions of the dbl3, plus the selected dimensions 10-39 of dbl2. +if [ $stage -le 14 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + steps/select_feats.sh --cmd "$train_cmd --max-jobs-run 4" 10-39 data/${dataset}_hires_dbl2 data/${dataset}_hires_dbl2_select \ + exp/make_dbl3/$dataset $mfccdir + rm data/${dataset}_hires_dbl2_select/cmvn.scp 2>/dev/null || true + steps/paste_feats.sh --cmd "$train_cmd --max-jobs-run 4" data/${dataset}_hires_dbl2_select data/${dataset}_dbl3 data/${dataset}_pasted \ + exp/make_dbl3/$dataset $mfccdir + steps/compute_cmvn_stats.sh data/${dataset}_pasted exp/make_dbl3/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_pasted + done +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_pasted $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 17 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          $graph_dir data/${decode_set}_pasted $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_pasted \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5v.sh b/egs/swbd/s5c/local/chain/run_tdnn_5v.sh new file mode 100755 index 00000000000..b33f013b894 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5v.sh @@ -0,0 +1,459 @@ +#!/bin/bash
+
+# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
+
+# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
+# I ended up running it again after I suspected that we had 'got lucky' with
+# this particular run (since various experiments using 5v as a starting point
+# were failures); that rerun is the 5v2 run.
+#
+# local/chain/compare_wer.sh 5e 5s 5t 5v 5v2
+# System                      5e        5s        5t        5v        5v2
+# WER on train_dev(tg)      15.43     15.47     15.43     15.38     15.74
+# WER on train_dev(fg)      14.32     14.31     14.34     14.39     14.50
+# WER on eval2000(tg)        17.3      17.4      17.4      17.4      17.5
+# WER on eval2000(fg)        15.5      15.6      15.6      15.7      15.9
+# Final train prob      -0.110056 -0.110928 -0.110752  -0.11156 -0.112155
+# Final valid prob      -0.129184 -0.132139 -0.129123 -0.131797 -0.129516
+
+
+# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
+# up), from 5000 to 3500.
+
+# about 5s: comparing with 5e which is the most recent baseline we actually
+# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
+# jesus-hidden-dim reduced 7500 to 5000, and the new option
+# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
+# smaller jesus-hidden-dims.
+
+# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
+# value of 1700 (between 1500 and 1800), and also fixing a bug in the self-repair
+# code which was doubling the thresholds so there was, in effect,
+# no upper threshold. I stopped the p,q,r runs after I found this, but in
+# configuring this run I'm bearing in mind the train and valid probs from the
+# p,q,r runs.
+
+# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
+
+# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
+# to compensate for the fact that more of the output dimensions are now being
+# usefully used.
+
+# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
+# ReLUs that are over or under-saturated.
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+
+# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
+# in the train and valid probs.
+#System                      5b        5e
+#WER on train_dev(tg)      15.51     15.43
+#WER on train_dev(fg)      14.39     14.32
+#WER on eval2000(tg)        17.3      17.3
+#WER on eval2000(fg)        15.6      15.5
+#Final train prob      -0.112013 -0.110056
+#Final valid prob      -0.130879 -0.129184
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
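+
+# (Note: several of the experiments above talk about re-using the egs; in this
+# script that is done explicitly, by pointing the training at the egs dumped for
+# the 2y run rather than re-dumping them, i.e. the train_tdnn.sh call below
+# passes something like:
+#   --egs-dir exp/chain/tdnn_2y_sp/egs \
+# )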
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
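+
+# (The phone-LM options discussed above correspond to the --lm-opts argument of
+# the training script; this script keeps the 2i-style setting, i.e. the
+# train_tdnn.sh call below passes something like:
+#   --lm-opts "--num-extra-lm-states=2000" \
+# )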
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
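+
+  # The options below are what distinguish 5v from its predecessors: the reduced
+  # --jesus-hidden-dim of 2500 (vs. 3500 in 5t and 5000 in 5s), the
+  # --self-repair-scale 0.00001 and --jesus-forward-output-dim 1700 introduced
+  # around 5p/5s, and re-use of the egs from the 2y run via --egs-dir.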
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5w.sh b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh new file mode 100755 index 00000000000..1a40acfa105 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +# _5w is as _5k (which is a fairly good-performing ivector-free model), but +# making the same changes as 5e -> 5t, which makes the model more lightweight +# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to +# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim +# from 1800 to 1700. + +# Difference is tiny. +#local/chain/compare_wer.sh 5k 5w +#System 5k 5w +#WER on train_dev(tg) 16.46 16.56 +#WER on train_dev(fg) 15.17 15.30 +#WER on eval2000(tg) 18.1 18.1 +#WER on eval2000(fg) 16.5 16.4 +#Final train prob -0.105502 -0.106549 +#Final valid prob -0.12337 -0.120079 + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. 
+# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
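+
+# (Note: since 5w is an ivector-free setup (see the 5j/5k notes above), the
+# train_tdnn.sh call below does not pass --online-ivector-dir, unlike e.g.
+# run_tdnn_5v.sh, which passes --online-ivector-dir exp/nnet3/ivectors_${train_set}.)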
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.1 \
+    --leaky-hmm-coefficient 0.1 \
+    --l2-regularize 0.00005 \
+    --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --frames-per-chunk 300 \
+          --nj 50 --cmd "$decode_cmd" \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5x.sh b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh new file mode 100755 index 00000000000..88dc28c2354 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh @@ -0,0 +1,476 @@ +#!/bin/bash
+
+# _5x is as _5w but decreasing the context of the averaging layer from +-0.99
+# seconds to +-0.66 seconds. I would not have expected this to work a priori,
+# but the change from 5k -> 5l, which made the context wider, made WERs slightly
+# worse, so I'd like to see what happens when we decrease the context.
+
+# It's worse. Odd because increasing the context (5k->5l) seemed to be a little
+# worse also.
+# local/chain/compare_wer.sh 5w 5x
+#System                       5w        5x
+#WER on train_dev(tg)      16.56     16.66
+#WER on train_dev(fg)      15.30     15.41
+#WER on eval2000(tg)        18.1      18.5
+#WER on eval2000(fg)        16.4      16.6
+#Final train prob      -0.106549 -0.105693
+#Final valid prob      -0.120079 -0.121834
+
+# _5w is as _5k (which is a fairly good-performing ivector-free model), but
+# making the same changes as 5e -> 5t, which makes the model more lightweight
+# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to
+# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim
+# from 1800 to 1700.
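+
+# (The "averaging layer" mentioned above is the statistics-extraction layer that
+# 5w requests through its --splice-indexes string, via the entry
+#   mean+stddev(-99:3:9:99)
+# (see run_tdnn_5w.sh); the -99 and 99 are the edges of the pooling window in
+# frames, i.e. the +-0.99 second context, and this script narrows that window
+# to about +-0.66 seconds.)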
+ +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
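+
+# For reference: the "%WER ..." numbers quoted throughout these comments come
+# from the decode directories.  The following is only a sketch of what a
+# show_wer.sh-style helper might do (the helper itself is not shown in this
+# patch), assuming the wer_* and score_*/*.sys layouts that appear in the paths
+# above; 'suffix' is a hypothetical variable naming the experiment to report on.
+#   suffix=2y
+#   for d in exp/chain/tdnn_${suffix}_sp/decode_train_dev_*; do
+#     grep WER $d/wer_* | utils/best_wer.sh          # train_dev WERs live in wer_* files
+#   done
+#   for d in exp/chain/tdnn_${suffix}_sp/decode_eval2000_*; do
+#     grep Sum $d/score_*/*.sys | utils/best_wer.sh  # eval2000 WERs live in sclite .sys files
+#   done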
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5w_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-63:3:9:63) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh new file mode 100755 index 00000000000..54769c23734 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh @@ -0,0 +1,476 @@ +#!/bin/bash + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). 
+#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
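+
+# A note on reading the --splice-indexes strings quoted in these comments:
+# ignoring the recurrence suffixes (like ":-3") and any pooling specs, the total
+# left/right context of the network is roughly the sum over layers of the most
+# negative / most positive offsets, and that total context is what decides
+# whether previously dumped egs can be re-used.  A small sketch (my own helper,
+# not part of this setup), applied to the _3d configuration above:
+#   echo "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" | \
+#     awk '{ for (i = 1; i <= NF; i++) { n = split($i, a, ","); left += -a[1]; right += a[n] }
+#            print "total left context:", left, " total right context:", right }'
+#   # prints: total left context: 18  total right context: 13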
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
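+
+# To make the _p note above concrete: near each edge of an eg the derivative
+# weights described there would look roughly like the sequence below (10 frames
+# of zero weight, then a 10-frame ramp up to 1.0).  This is only an illustration
+# of the scheme as described; the actual weights are dumped with the egs.
+#   awk 'BEGIN { for (t = 0; t < 25; t++) {
+#          w = (t < 10) ? 0.0 : (t < 20 ? (t - 9) / 10.0 : 1.0);
+#          printf("frame %d: weight %.1f\n", t, w) } }'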
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh new file mode 100755 index 00000000000..94843bfa2c9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# _5z is as _5v, but adding skip-splicing (a new configuration option) +# It seems not helpful. I'll remove the option soon. +# note: 5v2 is a rerun of 5v. + +# local/chain/compare_wer.sh 5v 5v2 5z +# System 5v 5v2 5z +# WER on train_dev(tg) 15.38 15.74 15.60 +# WER on train_dev(fg) 14.39 14.50 14.50 +# WER on eval2000(tg) 17.4 17.5 17.6 +# WER on eval2000(fg) 15.7 15.9 15.9 +# Final train prob -0.11156 -0.112155 -0.113823 +# Final valid prob -0.131797 -0.129516 -0.131356 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
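+
+# The "Final train prob" / "Final valid prob" rows in the tables above are the
+# 'chain' objective values from the training diagnostics.  A sketch of how such
+# numbers can be pulled out of the experiment directories; the log names
+# (log/compute_prob_{train,valid}.final.log) are an assumption about the layout
+# produced by the training script, not something configured here.
+#   for x in 5v 5v2 5z; do
+#     echo "=== $x ==="
+#     grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_{train,valid}.final.log
+#   done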
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
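+
+# Related to the _2c note above about scales: since training uses
+# transition-scale and self-loop-scale of 1, graph creation and decoding further
+# down in this script are run as
+#   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+#   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 ...
+# (the same commands as stages 13 and 14 below).  --acwt 1.0 is the acoustic
+# scale 'chain' models are decoded with, and --post-decode-acwt 10.0 scales the
+# acoustic scores up in the written lattices so that the usual scoring setup
+# (LM-scale directories like wer_10 or score_9 quoted above) can be reused.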
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh new file mode 100755 index 00000000000..c618d1c0adf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -0,0 +1,490 @@ +#!/bin/bash + +# _6a is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# No better. +#local/chain/compare_wer.sh 5v 6a +#System 5v 6a +#WER on train_dev(tg) 15.38 15.49 +#WER on train_dev(fg) 14.39 14.30 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.109471 +#Final valid prob -0.131797 -0.129035 + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). 
+# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). +#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
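+
+# A back-of-the-envelope check of the "about a million parameters" figure in the
+# _5y note above, under the rough assumption that the final affine layer has
+# about final-hidden-dim * num-leaves weights (ignoring biases and the extra
+# cross-entropy output); the tree built below targets 9000 leaves:
+#   echo $(( (500 - 400) * 9000 ))   # ~900k weights moved out of the final layer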
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
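+
+# A quick sanity check on the --frames-per-iter figure in the _2y note above,
+# assuming the number of egs processed per iteration is roughly
+# frames-per-iter divided by frames-per-eg (which is 150 below):
+#   echo $(( 1200000 / 150 ))   # => 8000 egs per iteration, summed over all jobs.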
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
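+
+# A sketch of the per-frame deriv weights near one edge of an eg, for the _p
+# edge-handling note above (assuming 10 zero-weight frames followed by a linear
+# ramp over the next 10; the real weights are dumped with the egs):
+#   for t in $(seq 0 24); do
+#     awk -v t=$t 'BEGIN{ w = (t < 10) ? 0 : (t < 20 ? (t - 9) / 10 : 1); printf "frame %2d  weight %.1f\n", t, w }'
+#   done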
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
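+
+  # Rough context arithmetic for the --splice-indexes string passed below,
+  # assuming the per-layer offsets simply accumulate across layers:
+  #   left context  = 1+1+3+3+3+6 = 17 frames
+  #   right context = 1+2+3+3+3+0 = 12 frames
+  # echo $(( 1+1+3+3+3+6 )) $(( 1+2+3+3+3+0 ))   # => 17 12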
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6b.sh b/egs/swbd/s5c/local/chain/run_tdnn_6b.sh new file mode 100755 index 00000000000..5cd3f7dfbf2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6b.sh @@ -0,0 +1,480 @@ +#!/bin/bash + +# _6b is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). 
+#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
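+
+  # Note on --xent-regularize 0.1 below: as I understand it, this adds a
+  # second, cross-entropy-trained output branch whose objective is scaled by
+  # 0.1 and added to the 'chain' objective as seen by the shared layers,
+  # roughly
+  #   total_objf ~= chain_objf + 0.1 * xent_objf
+  # so it acts as a regularizer on the hidden parts of the network.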
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6c.sh b/egs/swbd/s5c/local/chain/run_tdnn_6c.sh new file mode 100755 index 00000000000..7334a5e185e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6c.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# _6c is as _5v but adding "--thick-jesus-layer true" (new option): extra hidden +# layer inside jesus layer. + +# Note: 5v2 is a rerun of 5v. +#local/chain/compare_wer.sh 5v 5v2 6c +#System 5v 5v2 6c +#WER on train_dev(tg) 15.38 15.74 15.54 +#WER on train_dev(fg) 14.39 14.50 14.55 +#WER on eval2000(tg) 17.4 17.5 17.5 +#WER on eval2000(fg) 15.7 15.9 15.8 +#Final train prob -0.11156 -0.112155 -0.114084 +#Final valid prob -0.131797 -0.129516 -0.129589 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
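+
+# For scale when reading the 6c numbers: 5v and 5v2 in the table near the top
+# of this file are the same configuration, so their spread gives a feel for
+# rerun-to-rerun noise, e.g.
+#   awk 'BEGIN{ printf "train_dev(tg) rerun delta = %.2f\n", 15.74 - 15.38 }'   # => 0.36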
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --thick-jesus-layer true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6d.sh b/egs/swbd/s5c/local/chain/run_tdnn_6d.sh new file mode 100755 index 00000000000..80b6a18cabf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6d.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# _6d is as _5v but changing adding --num-jesus-blocks 84 (default is 100). +# this means (after rounding) that we have 6, not 5, as +# --jesus-forward-input-dim / --num-jesus-blocks. + +# no clear difference. +#[note, 5v2 is a rerun of 5v]. +# local/chain/compare_wer.sh 5v 5v2 6d +# System 5v 5v2 6d +# WER on train_dev(tg) 15.38 15.74 15.66 +# WER on train_dev(fg) 14.39 14.50 14.54 +# WER on eval2000(tg) 17.4 17.5 17.5 +# WER on eval2000(fg) 15.7 15.9 15.8 +# Final train prob -0.11156 -0.112155 -0.112034 +# Final valid prob -0.131797 -0.129516 -0.131714 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
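+
+# Arithmetic behind the --num-jesus-blocks 84 note at the top of this file,
+# assuming the per-block dimension is just --jesus-forward-input-dim (here 500)
+# divided by --num-jesus-blocks, rounded:
+#   awk 'BEGIN{ printf "%.0f vs %.0f\n", 500/84, 500/100 }'   # => 6 vs 5 dims per block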
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
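+
+# A quick sanity-check (not used by the recipe) of the temporal context implied
+# by the --splice-indexes string passed to train_tdnn.sh further below, assuming
+# the model's total left/right context is just the per-layer offsets summed:
+# echo "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" | \
+#   awk '{ l=0; r=0;
+#          for (i=1; i<=NF; i++) { n=split($i, a, ","); l+=a[1]; r+=a[n] }
+#          printf("left context %d, right context %d\n", -l, r) }'
+# # -> left context 17, right context 12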
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--num-jesus-blocks 84 --jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6e.sh b/egs/swbd/s5c/local/chain/run_tdnn_6e.sh new file mode 100755 index 00000000000..d44973db7ba --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6e.sh @@ -0,0 +1,464 @@ +#!/bin/bash + + +# _6e is as _6d but going further: reducing --num-jesus-blocks to 72 = ceil(500/7). + +# +# _6d is as _5v but changing adding --num-jesus-blocks 84 (default is 100). +# this means (after rounding) that we have 6, not 5, as +# --jesus-forward-input-dim / --num-jesus-blocks. + + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
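+
+# A quick sanity-check (not used by the recipe) of the per-block dimension
+# arithmetic in the _6d/_6e notes above, assuming the config generator rounds
+# jesus-forward-input-dim / num-jesus-blocks to the nearest integer:
+# for blocks in 100 84 72; do
+#   awk -v dim=500 -v b=$blocks \
+#     'BEGIN { printf("num-jesus-blocks %3d -> %d inputs per block\n", b, int(dim/b + 0.5)) }'
+# done
+# # -> 100 gives 5, 84 gives 6, 72 gives 7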
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
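+
+# A sketch (not the actual get_egs code) of the shape of the per-frame
+# derivative weights described in the _p note below: roughly 10 zero-weight
+# frames at each edge of an eg, then a linear ramp up to 1.0 over the next 10
+# frames; the exact values the scripts produce may differ.
+# awk -v T=150 'BEGIN { for (t=0; t<T; t++) {
+#   d = (t < T-1-t) ? t : T-1-t;                  # distance to the nearer edge
+#   w = (d < 10) ? 0.0 : ((d < 20) ? (d-10)/10.0 : 1.0);
+#   printf("frame %3d  weight %.1f\n", t, w) } }'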
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--num-jesus-blocks 72 --jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6f.sh b/egs/swbd/s5c/local/chain/run_tdnn_6f.sh new file mode 100755 index 00000000000..fb7ff03b66d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6f.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +# note, 5v2 is a rerun of 5v. +# local/chain/compare_wer.sh 5v 5v2 6f +# System 5v 5v2 6f +# WER on train_dev(tg) 15.38 15.74 15.71 +# WER on train_dev(fg) 14.39 14.50 14.50 +# WER on eval2000(tg) 17.4 17.5 17.5 +# WER on eval2000(fg) 15.7 15.9 15.9 +# Final train prob -0.11156 -0.112155 -0.111305 +# Final valid prob -0.131797 -0.129516 -0.131487 + + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
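+
+# A minimal sketch (assumed variable names; the recipe's own logic may differ)
+# of how the _sp suffix mentioned in the config comment below gets attached to
+# the directory names when $speed_perturb is true:
+# if [ "$speed_perturb" == "true" ]; then suffix=_sp; else suffix=; fi
+# dir=${dir}$suffix       # e.g. exp/chain/tdnn_6f -> exp/chain/tdnn_6f_sp
+# train_set=train_nodup$suffix   # hypothetical; matches the _nodup$suffix naming used below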
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6g.sh b/egs/swbd/s5c/local/chain/run_tdnn_6g.sh new file mode 100755 index 00000000000..8d4e8b79fd0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6g.sh @@ -0,0 +1,491 @@ +#!/bin/bash + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# seems better than 6f, and about the same as (5v,5v2). encouraging. +# note, 5v2 is rerun of 5v. +#local/chain/compare_wer.sh 5v 5v2 6f 6g +#System 5v 5v2 6f 6g +#WER on train_dev(tg) 15.38 15.74 15.71 15.50 +#WER on train_dev(fg) 14.39 14.50 14.50 14.31 +#WER on eval2000(tg) 17.4 17.5 17.5 17.5 +#WER on eval2000(fg) 15.7 15.9 15.9 15.8 +#Final train prob -0.11156 -0.112155 -0.111305 -0.105853 +#Final valid prob -0.131797 -0.129516 -0.131487 -0.129997 + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. 
+#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
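+
+  # As a rough sanity check (assuming the per-layer offsets in --splice-indexes
+  # compose additively across layers, as is usual for TDNNs), the indexes used
+  # below ("-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0") give a total acoustic
+  # context of about 17 frames on the left and 12 on the right per output frame:
+  #   left  = 1+1+3+3+3+6 = 17
+  #   right = 1+2+3+3+3+0 = 12
+  # A small bash sketch of that computation (illustrative only, not part of the
+  # recipe; uncomment to run):
+  #   left=0; right=0
+  #   for layer in "-1,0,1" "-1,0,1,2" "-3,0,3" "-3,0,3" "-3,0,3" "-6,-3,0"; do
+  #     min=$(echo $layer | tr ',' '\n' | sort -n | head -1)
+  #     max=$(echo $layer | tr ',' '\n' | sort -n | tail -1)
+  #     left=$((left - min)); right=$((right + max))
+  #   done
+  #   echo "left=$left right=$right"   # -> left=17 right=12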
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h.sh new file mode 100755 index 00000000000..f3065cec603 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h.sh @@ -0,0 +1,494 @@ +#!/bin/bash + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. +#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. 
+#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h_discriminative.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h_discriminative.sh new file mode 100755 index 00000000000..85afa7bf9ca --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h_discriminative.sh @@ -0,0 +1,265 @@ +#!/bin/bash + +set -o pipefail +set -e +# this is run_discriminative.sh + +# This script does discriminative training on top of chain nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. 
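+#
+# Rough outline of the stages below (a summary of this script, reading from the
+# stage numbers used further down):
+#   - early stage: if frame_subsampling_factor > 1, make frame-shifted copies of
+#     the training data and i-vectors so all input frame offsets are covered
+#   - stage 1: steps/nnet3/align.sh        -- align the training data with the chain model
+#   - stage 2: steps/nnet3/make_denlats.sh -- generate denominator lattices
+#   - stage 3: steps/nnet3/get_egs_discriminative.sh -- dump discriminative examples (degs)
+#   - stage 4: steps/nnet3/train_discriminative.sh   -- sMBR training on the degs
+#   - stage 5: decode the test sets for each epoch (plus const-arpa LM rescoring)
+#   - stage 6: optional cleanup of lattices, alignments and degs (with --cleanup true)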
+# +# eval2000 + +# chain 7b +# %WER 17.2 | 4459 42989 | 84.8 10.2 5.0 2.0 17.2 54.4 | exp/chain/tdnn_7b_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# chain 7b + smbr +# %WER 16.9 | 4459 42989 | 85.2 10.3 4.5 2.1 16.9 54.4 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_tg_epoch1/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 85.4 10.5 4.1 2.3 16.9 54.2 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_tg_epoch2/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.3 10.4 4.3 2.3 17.0 54.5 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_tg_epoch3/score_12_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.1 | 4459 42989 | 85.2 10.5 4.3 2.4 17.1 54.5 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_tg_epoch4/score_12_0.5/eval2000_hires.ctm.filt.sys + +# chain 7b +# %WER 15.5 | 4459 42989 | 86.3 9.0 4.7 1.8 15.5 51.3 | exp/chain/tdnn_7b_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# chain 7b + smbr +# %WER 15.2 | 4459 42989 | 86.8 9.1 4.1 2.0 15.2 51.2 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_fsh_fg_epoch1/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 15.1 | 4459 42989 | 86.9 9.0 4.1 2.0 15.1 51.3 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_fsh_fg_epoch2/score_12_0.0/eval2000_hires.ctm.filt.sys +# %WER 15.1 | 4459 42989 | 87.0 9.1 3.9 2.1 15.1 51.2 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_fsh_fg_epoch3/score_12_0.5/eval2000_hires.ctm.filt.sys +# %WER 15.2 | 4459 42989 | 87.0 9.2 3.8 2.2 15.2 51.5 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_fsh_fg_epoch4/score_12_0.5/eval2000_hires.ctm.filt.sys + + +# RT'03 + +# chain 7b +# %WER 21.6 | 8420 76157 | 80.5 12.8 6.7 2.1 21.6 53.7 | exp/chain/tdnn_7b_sp/decode_rt03_sw1_tg/score_9_0.0/rt03_hires.ctm.filt.sys + +# chain 7b + smbr +# %WER 21.0 | 8420 76157 | 81.3 12.8 5.8 2.4 21.0 53.0 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_tg_epoch1/score_10_0.0/rt03_hires.ctm.filt.sys +# %WER 20.8 | 8420 76157 | 81.6 12.5 6.0 2.4 20.8 53.0 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_tg_epoch2/score_11_0.0/rt03_hires.ctm.filt.sys +# %WER 20.8 | 8420 76157 | 81.6 12.6 5.8 2.5 20.8 53.1 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_tg_epoch3/score_11_0.5/rt03_hires.ctm.filt.sys +# %WER 20.9 | 8420 76157 | 81.7 12.7 5.6 2.6 20.9 53.2 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_tg_epoch4/score_11_0.0/rt03_hires.ctm.filt.sys + +# chain 7b +# %WER 19.0 | 8420 76157 | 82.7 10.2 7.2 1.7 19.0 50.0 | exp/chain/tdnn_7b_sp/decode_rt03_sw1_fsh_fg/score_10_0.0/rt03_hires.ctm.filt.sys + +# chain 7b + smbr +# %WER 18.2 | 8420 76157 | 83.7 10.4 5.9 1.9 18.2 49.3 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_fsh_fg_epoch1/score_11_0.0/rt03_hires.ctm.filt.sys +# %WER 18.1 | 8420 76157 | 83.9 10.7 5.4 2.1 18.1 49.3 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_fsh_fg_epoch2/score_11_0.0/rt03_hires.ctm.filt.sys +# %WER 18.1 | 8420 76157 | 84.0 10.7 5.3 2.1 18.1 49.4 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_fsh_fg_epoch3/score_11_1.0/rt03_hires.ctm.filt.sys +# %WER 18.2 | 8420 76157 | 83.8 10.5 5.7 2.1 18.2 49.6 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_fsh_fg_epoch4/score_12_1.0/rt03_hires.ctm.filt.sys + +. cmd.sh + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=-10 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +srcdir=exp/chain/tdnn_7b_sp +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +degs_dir= # If provided, will skip the degs directory creation +lats_dir= # If provided, will skip denlats creation + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion} + +## Egs options +frames_per_eg=150 +frames_overlap_per_eg=30 +truncate_deriv_weights=10 + +## Nnet training options +effective_learning_rate=0.000000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=4 +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +minibatch_size=64 + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! cuda-compiled; then + cat </dev/null || true + + data_dirs= + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ + $x $train_data_dir exp/shift_hires/ mfcc_hires + utils/fix_data_dir.sh ${train_data_dir}_fs$x + data_dirs="$data_dirs ${train_data_dir}_fs$x" + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp + done + utils/combine_data.sh ${train_data_dir}_fs $data_dirs + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + rm -r ${train_data_dir}_fs$x + done + fi + + train_data_dir=${train_data_dir}_fs + + affix=_fs +fi + +rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp +done +online_ivector_dir=${online_ivector_dir}_fs + +if [ $stage -le 1 ]; then + # hardcode no-GPU for alignment, although you could use GPU [you wouldn't + # get excellent GPU utilization though.] + nj=350 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ + --online-ivector-dir $online_ivector_dir \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; +fi + +if [ -z "$lats_dir" ]; then + lats_dir=${srcdir}_denlats${affix} + if [ $stage -le 2 ]; then + nj=50 + # this doesn't really affect anything strongly, except the num-jobs for one of + # the phases of get_egs_discriminative.sh below. + num_threads_denlats=6 + subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving + # total slots = 80 * 6 = 480. 
+ steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ + --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --online-ivector-dir $online_ivector_dir \ + --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ + $train_data_dir $lang $srcdir ${lats_dir} ; + fi +fi + +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] + +valid_left_context=$[valid_left_context + frames_per_eg] +valid_right_context=$[valid_right_context + frames_per_eg] + +cmvn_opts=`cat $srcdir/cmvn_opts` + +if [ -z "$degs_dir" ]; then + degs_dir=${srcdir}_degs${affix} + + if [ $stage -le 3 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage + fi + # have a higher maximum num-jobs if + if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi + + degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" + + steps/nnet3/get_egs_discriminative.sh \ + --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ + --adjust-priors false --acwt 1.0 \ + --online-ivector-dir $online_ivector_dir \ + --left-context $left_context --right-context $right_context \ + --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ + --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; + fi +fi + +if [ $stage -le 4 ]; then + steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ + --stage $train_stage \ + --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ + --criterion $criterion --drop-frames true --acoustic-scale 1.0 \ + --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ + --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ + --regularization-opts "$regularization_opts" --use-frame-shift false \ + --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --modify-learning-rates false \ + ${degs_dir} $dir ; +fi + +graph_dir=$srcdir/graph_sw1_tg +if [ $stage -le 5 ]; then + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in train_dev eval2000 rt03; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x.adj + + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg_$iter ; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_$iter ; + fi + ) & + done + done +fi +wait; + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. 
+ rm ${lats_dir}/lat.*.gz || true + rm ${srcdir}_ali/ali.*.gz || true + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + + +exit 0; + diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh new file mode 100755 index 00000000000..a5a96de7f38 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is a replica of_6h script, but makes use of the python trainer +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6h_py # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/make_jesus_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --jesus-forward-input-dim 600 \ + --jesus-forward-output-dim 1700 \ + --jesus-hidden-dim 0 \ + --jesus-stddev-scale 0.2 \ + --final-layer-learning-rate-factor 0.25 \ + --self-repair-scale 0.00001 \ + --xent-separate-forward-affine=true \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
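+
+  # For readers comparing this with the train_tdnn.sh call in run_tdnn_6h.sh (also
+  # in this patch): the old options map onto the python trainer's dotted options
+  # roughly as follows (based only on the two calls shown here):
+  #   --xent-regularize        -> --chain.xent-regularize
+  #   --leaky-hmm-coefficient  -> --chain.leaky-hmm-coefficient
+  #   --l2-regularize          -> --chain.l2-regularize
+  #   --lm-opts                -> --chain.lm-opts
+  #   --egs-dir / --egs-opts   -> --egs.dir / --egs.opts
+  #   --frames-per-eg          -> --egs.chunk-width
+  #   --minibatch-size         -> --trainer.num-chunk-per-minibatch
+  #   --num-epochs, --num-jobs-*, --*-effective-lrate -> --trainer.* options
+  #   --online-ivector-dir / --cmvn-opts -> --feat.* options
+  #   --remove-egs             -> --cleanup.remove-egs
+  # The jesus-layer configuration itself is generated separately above (stage 12)
+  # by steps/nnet3/make_jesus_configs.py.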
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6i.sh b/egs/swbd/s5c/local/chain/run_tdnn_6i.sh new file mode 100755 index 00000000000..457b424be73 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6i.sh @@ -0,0 +1,497 @@ +#!/bin/bash + +# _6i takes aspects from 5n and 6g. Like 6g it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. + +# local/chain/compare_wer.sh 6g 6i +# System 6g 6i +# WER on train_dev(tg) 15.50 15.62 +# WER on train_dev(fg) 14.31 14.46 +# WER on eval2000(tg) 17.5 17.3 +# WER on eval2000(fg) 15.8 15.8 +# Final train prob -0.105853 -0.10417 +# Final valid prob -0.129997 -0.123985 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. 
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 20 is equivalent to 150 at 10ms frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
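+
+  # For reference, the relevant numbers for this faster-frame-rate setup (a rough
+  # sketch derived from the options above, not extra configuration):
+  #   input frame shift   = 7.5 ms  (the 0.0075 s written to $dir/frame_shift; the
+  #                         _hiresf features generated in stage 12)
+  #   output frame shift  = 4 * 7.5 ms = 30 ms  (--frame-subsampling-factor 4),
+  #                         i.e. the same 30 ms as the usual 3 * 10 ms chain setup.
+  #   chunk length        = 200 * 7.5 ms = 1.5 s, matching 150 frames at 10 ms
+  #                         (hence frames_per_eg=200 in the configs above).
+  #   iVector period      = 10 frames * 10 ms = 100 ms in the original iVectors;
+  #                         100 / 7.5 = 13.33 of the new frames, rounded to 14
+  #                         (13 for the training set) in the "fake" iVector
+  #                         directories created in stage 13.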
+fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6j.sh b/egs/swbd/s5c/local/chain/run_tdnn_6j.sh new file mode 100755 index 00000000000..ded13de9470 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6j.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +#Final train prob (xent) -1.60566 -1.45908 +#Final valid prob (xent) -1.67945 -1.55937 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
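+
+  # Note on the subsampling factors in the command below (a clarifying sketch based
+  # on the 6i/6j descriptions above): 6j keeps the standard 10 ms features, so the
+  # tri4 lattices/alignments (also at 10 ms) line up with the input and both
+  # --frame-subsampling-factor and --alignment-subsampling-factor are 4, giving a
+  # 4 * 10 ms = 40 ms output frame shift.  In 6i the input is at 7.5 ms while the
+  # alignments are still at 10 ms, hence --frame-subsampling-factor 4 but
+  # --alignment-subsampling-factor 3 (both paths end up at a 30 ms output shift).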
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6k.sh b/egs/swbd/s5c/local/chain/run_tdnn_6k.sh new file mode 100755 index 00000000000..4625da200e6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6k.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# _6k is as _6i, but one more epoch. After running the first few stages, I'm +# copying the last model from 6i and starting from that point, to save compute. +# No better. +#local/chain/compare_wer.sh 6i 6k +#System 6i 6k +#WER on train_dev(tg) 15.62 15.67 +#WER on train_dev(fg) 14.46 14.47 +#WER on eval2000(tg) 17.3 17.4 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417-0.0994163 +#Final valid prob -0.123985 -0.122743 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. 
+ +# local/chain/compare_wer.sh 6h 6i +# System 6h 6i +# WER on train_dev(tg) 15.46 15.62 +# WER on train_dev(fg) 14.28 14.46 +# WER on eval2000(tg) 17.4 17.3 +# WER on eval2000(fg) 15.7 15.8 +# Final train prob -0.105663 -0.10417 +# Final valid prob -0.130166 -0.123985 + + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
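[Editor's note] Several of the comparisons quoted above are pasted from ./show_wer.sh, which is not included in this patch. The following is only a rough sketch of what such a helper might look like (the real script may differ); it assumes the decode-directory layout visible in the paths above and the stock utils/best_wer.sh:

#!/bin/bash
# Hypothetical show_wer.sh-style helper; not part of this patch.
# Usage: ./show_wer.sh 4f    (prints the four %WER lines quoted above)
suffix=$1
dir=exp/chain/tdnn_${suffix}_sp
for lm in sw1_tg sw1_fsh_fg; do
  # train_dev decodes keep plain wer_* files from compute-wer scoring:
  grep WER $dir/decode_train_dev_${lm}/wer_* | utils/best_wer.sh
done
for lm in sw1_tg sw1_fsh_fg; do
  # eval2000 is scored with sclite; the summary lines live in the *.sys files:
  grep Sum $dir/decode_eval2000_${lm}/score_*/*.ctm.filt.sys | utils/best_wer.sh
done
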
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6k # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 20 is equivalent to 150 at 10ms frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6i_sp/egs \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
+fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6l.sh b/egs/swbd/s5c/local/chain/run_tdnn_6l.sh new file mode 100755 index 00000000000..f1e0821f2cf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6l.sh @@ -0,0 +1,521 @@ +#!/bin/bash + +# _6l is as _6i, but adding the option --xent-separate-forward-affine=true which +# I had accidentally omitted, and adding 4 frames more left context and 2 frames +# more right context. + +# Below I'm also comparing with 6h, which (since we now added +# --xent-separate-forward-affine=true) is the appopriate normal-frame-rate +# baseline, rather than 6g. + +# This experiment is better than 6i, but there is no clear difference with +# 6h. So we can't really say that we're getting any benefit from the higher +# frame rate. + +#local/chain/compare_wer.sh 6h 6i 6l +#System 6h 6i 6l +#WER on train_dev(tg) 15.46 15.62 15.42 +#WER on train_dev(fg) 14.28 14.46 14.25 +#WER on eval2000(tg) 17.4 17.3 17.3 +#WER on eval2000(fg) 15.7 15.8 15.8 +#Final train prob -0.105663 -0.10417-0.0984719 +#Final valid prob -0.130166 -0.123985 -0.119088 +#Final train prob (xent) -1.42483 -1.60566 -1.46581 +#Final valid prob (xent) -1.49792 -1.67945 -1.51644 + + +# _6i takes aspects from 5n and 6g. Like 6g it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. + +# local/chain/compare_wer.sh 6g 6i +# System 6g 6i +# WER on train_dev(tg) 15.50 15.62 +# WER on train_dev(fg) 14.31 14.46 +# WER on eval2000(tg) 17.5 17.3 +# WER on eval2000(fg) 15.8 15.8 +# Final train prob -0.105853 -0.10417 +# Final valid prob -0.129997 -0.123985 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. 
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
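[Editor's note] Since 6l inherits the 7.5 ms "hiresf" front end of 6i, it may help to spell out the frame-rate arithmetic that the comments above and the recipe below rely on. This is purely illustrative, not part of the recipe; the numbers are copied from the script itself (7.5 ms frame shift, --frame-subsampling-factor 4, --alignment-subsampling-factor 3, frames_per_eg=200, old ivector_period of 10):

# Illustrative arithmetic only (values taken from the script):
input_shift=7.5   # ms, the mfcc_hiresf frame shift
echo "output shift:    4 x $input_shift ms = $(echo "4 * $input_shift" | bc) ms"        # 30 ms
echo "alignment shift: 3 x 10 ms  = $((3 * 10)) ms"                                     # also 30 ms
echo "eg length: 200 x $input_shift ms = $(echo "200 * $input_shift" | bc) ms"          # same 1.5 s as 150 x 10 ms
echo "exact ivector_period: 100 / $input_shift = $(echo "scale=2; 100 / $input_shift" | bc)"  # ~13.33, hence 13/14

The fact that the 30 ms output shift matches three steps of the original 10 ms alignments is what lets --frame-subsampling-factor 4 be combined with --alignment-subsampling-factor 3 against the existing lattices, and the 0.0075 written to $dir/frame_shift is simply the input shift in seconds for sclite scoring.
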
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 20 is equivalent to 150 at 10ms frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{05,b11,b12,b13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2,4 -4,0,4 -4,0,4 -8,-4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
+fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6m.sh b/egs/swbd/s5c/local/chain/run_tdnn_6m.sh new file mode 100755 index 00000000000..8a7b14ef342 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6m.sh @@ -0,0 +1,497 @@ +#!/bin/bash + +# _6m is as _6j (which subsamples by 4 frames not 3 at the output), changing just the +# --left-tolerance and --right-tolerance to be the same total width but more +# symmetrical (-7,+8) vs the default (-5, +10). + +# this is unhelpful and if anything is a little worse. +#local/chain/compare_wer.sh 6j 6m +#System 6j 6m +#WER on train_dev(tg) 15.86 16.08 +#WER on train_dev(fg) 14.79 14.85 +#WER on eval2000(tg) 17.6 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.131444 -0.131515 +#Final valid prob -0.167574 -0.17046 +#Final train prob (xent) -1.45908 -1.43814 +#Final valid prob (xent) -1.55937 -1.5412 + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. 
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
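[Editor's note] Two of the 6m-specific numbers quoted near the top of this header are easy to sanity-check; the lines below are only a worked illustration of those comments, not part of the recipe:

# --left/right-tolerance: the default (-5,+10) and 6m's (-7,+8) allow the same
# total slack of 15 frames; 6m just centres the window:
echo "default width: $((5 + 10)); 6m width: $((7 + 8))"                    # both 15
# num_epochs=3: with 4 frame-shifted copies of the data this sees as many
# examples as the usual num_epochs=4 with 3 frame shifts:
echo "3 epochs x 4 shifts = $((3 * 4)); 4 epochs x 3 shifts = $((4 * 3))"  # both 12
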
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --left-tolerance 7 --right-tolerance 8 \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6n.sh b/egs/swbd/s5c/local/chain/run_tdnn_6n.sh new file mode 100755 index 00000000000..625cb73cf50 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6n.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# _6n is as _6m, but with a less-wide splicing context. + +# The effect is inconsistent- there is none, on average. 
+#System 6j 6m 6n +#WER on train_dev(tg) 15.86 16.08 16.01 +#WER on train_dev(fg) 14.79 14.85 14.66 +#WER on eval2000(tg) 17.6 17.6 17.7 +#WER on eval2000(fg) 15.8 15.8 15.9 +#Final train prob -0.131444 -0.131515 -0.133681 +#Final valid prob -0.167574 -0.17046 -0.172072 +#Final train prob (xent) -1.45908 -1.43814 -1.53108 +#Final valid prob (xent) -1.55937 -1.5412 -1.65137 + +# _6m is as _6j (which subsamples by 4 frames), changing just the +# --left-tolerance and --right-tolerance to be the same total width but more +# symmetrical (-7,+8) vs the default (-5, +10). + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. 
+ +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
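+# (in the present script the configs below keep leftmost_questions_truncate=-1,
+# i.e. that truncation stays disabled, as in 2m.)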
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same.  On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
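+  # Note (roughly; the details are in utils/create_split_dir.pl): the script
+  # above should make $dir/egs/storage a set of numbered symlinks into the
+  # listed /export/b0{5,6,7,8} directories, so that when egs are actually
+  # dumped here (rather than re-used via --egs-dir, as below) the large
+  # archives get spread over several disks; the .nodelete file is an empty
+  # marker, presumably checked by whatever cleanup would otherwise remove
+  # stale egs.  To see where the archives would land:
+  #   ls -l $dir/egs/storage/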
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6m_sp/egs \ + --left-tolerance 7 --right-tolerance 8 \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -2,0,2 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6o.sh b/egs/swbd/s5c/local/chain/run_tdnn_6o.sh new file mode 100755 index 00000000000..e07e6092644 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6o.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# _6o is as _6h but halving the --l2-regularize option, because since the +# time we last tuned this, other regularization methods have been added. + +#It's worse. +#local/chain/compare_wer.sh 6h 6o +#System 6h 6o +#WER on train_dev(tg) 15.46 15.61 +#WER on train_dev(fg) 14.28 14.58 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.105663-0.0992526 +#Final valid prob -0.130166 -0.127421 +#Final train prob (xent) -1.42483 -1.4369 +#Final valid prob (xent) -1.49792 -1.49867 + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. 
+#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
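+  # Note: the call below re-uses egs already dumped for the 2y system
+  # (--egs-dir exp/chain/tdnn_2y_sp/egs), so nothing is written to the storage
+  # directories set up above.  This presumably only works because the
+  # splice-indexes keep the model's total left/right context within what those
+  # egs were dumped with; assuming the usual get_egs.sh layout, that context
+  # can be checked with e.g.:
+  #   cat exp/chain/tdnn_2y_sp/egs/info/{left_context,right_context,frames_per_eg}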
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.000025 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6p.sh b/egs/swbd/s5c/local/chain/run_tdnn_6p.sh new file mode 100755 index 00000000000..a9f7eef9bbc --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6p.sh @@ -0,0 +1,503 @@ +#!/bin/bash + +# _6p is as _6j, but increasing the various regularization coefficients. +# the intention is to increase them by 4/3, since they are all evaluated +# once per output frame, and there are now fewer output frames by a factor +# of 3/4. To make them rounder numbers, I increased some by a factor +# of 5/4 (--xent-regularize, 0.1 -> 0.125, and --leaky-hmm-coefficient, +# 0.1 -> 0.125), and l2-regularize by 3/2 (0.00005 -> 0.000075). + +# Worse. +#local/chain/compare_wer.sh 6j 6p +#System 6j 6p +#WER on train_dev(tg) 15.86 15.91 +#WER on train_dev(fg) 14.79 14.76 +#WER on eval2000(tg) 17.6 17.9 +#WER on eval2000(fg) 15.8 15.9 +#Final train prob -0.131444 -0.143285 +#Final valid prob -0.167574 -0.173759 +#Final train prob (xent) -1.45908 -1.44287 +#Final valid prob (xent) -1.55937 -1.52918 + + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. 
+# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +#Final train prob (xent) -1.60566 -1.45908 +#Final valid prob (xent) -1.67945 -1.55937 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
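+  # (Note: --egs-dir below points at exp/chain/tdnn_6j_sp/egs, i.e. the examples
+  # dumped by the 6j run are presumably reused here rather than re-generated
+  # under $dir/egs.)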
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6j_sp/egs \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.125 \ + --leaky-hmm-coefficient 0.125 \ + --l2-regularize 0.000075 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6q.sh b/egs/swbd/s5c/local/chain/run_tdnn_6q.sh new file mode 100755 index 00000000000..440da3a1d6b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6q.sh @@ -0,0 +1,493 @@ +#!/bin/bash + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. +# (note, I forgot the self-repair-scale, and I probably should have used +# 6h as the baseline because it has --xent-separate-forward-affine=true; +# note, this experiment doesn't have --xent-separate-forward-affine=true but +# it would have been better to have it (retrying as 6r) + +# we're about 0.2% better than 6g. 
+#local/chain/compare_wer.sh 6g 6q +#System 6g 6q +#WER on train_dev(tg) 15.50 15.25 +#WER on train_dev(fg) 14.31 14.24 +#WER on eval2000(tg) 17.5 17.2 +#WER on eval2000(fg) 15.8 15.6 +#Final train prob -0.105853 -0.106936 +#Final valid prob -0.129997 -0.123066 +#Final train prob (xent) -1.4718 -1.66328 +#Final valid prob (xent) -1.55129 -1.71979 + + + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=13 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
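+  # (The alignments in $ali_dir are at the original 10 ms frame rate, so a
+  # frame-subsampling factor of 3 here corresponds to the 30 ms output rate of
+  # the chain model; the 5 ms double-frame-rate features used for training
+  # below reach the same 30 ms rate via --frame-subsampling-factor 6.)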
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6r.sh b/egs/swbd/s5c/local/chain/run_tdnn_6r.sh new file mode 100755 index 00000000000..ffbac19d1eb --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6r.sh @@ -0,0 +1,492 @@ +#!/bin/bash + +# _6r is as _6q, but adding --self-repair-scale 0.00001 +# --xent-separate-forward-affine=true. the appropriate normal-frame-rate +# baseline for this is 6h (since it has --xent-separate-forward-affine=true), +# so using that as the baseline: + +#local/chain/compare_wer.sh 6h 6r +#System 6h 6r +#WER on train_dev(tg) 15.46 15.06 +#WER on train_dev(fg) 14.28 14.05 +#WER on eval2000(tg) 17.4 17.2 +#WER on eval2000(fg) 15.7 15.4 +#Final train prob -0.105663 -0.106685 +#Final valid prob -0.130166 -0.122293 +#Final train prob (xent) -1.42483 -1.62108 +#Final valid prob (xent) -1.49792 -1.67695 + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
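+    # (The iVectors were extracted once per 10 frames of 10 ms features, i.e.
+    # every 100 ms; with the 5 ms frames of the _hires_dbl data the same 100 ms
+    # interval spans 20 frames, hence the ivector_period of 20 written below.)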
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6q_sp/egs \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6s.sh b/egs/swbd/s5c/local/chain/run_tdnn_6s.sh new file mode 100755 index 00000000000..4693dde0a31 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6s.sh @@ -0,0 +1,502 @@ +#!/bin/bash + + +# _6s is as _6r, but changing the splicing indexes to be exactly the same as 6h, +# but all multiplied by 2. This means that for any given frame-shift, the network +# sees exactly the same input as 6h; the only difference is that we see +# more frame shifts, i.e. the data is more carefully perturbed than 6h. +# this is to help disentangle whether the improvement really comes from the +# higher-resolution features, or from the improved data shifting. 
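+# (Concretely: a splice offset of -3 on 10 ms frames looks 30 ms into the
+# past, and the doubled offset of -6 on these 5 ms frames covers the same
+# 30 ms, so the per-layer time context is unchanged relative to 6h.)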
+ +# So we lose the improvement that we got in 6r (see below). This is consistent +# with the idea that we really do need the higher-frame-rate input, but it's +# also possible that some slight differences in the splicing indexes were +# responsible, so in 6t we'll do an experiment where we try to get closer +# to the splicing setup of 6r. +# +# local/chain/compare_wer.sh 6h 6r 6s +#System 6h 6r 6s +#WER on train_dev(tg) 15.46 15.06 15.50 +#WER on train_dev(fg) 14.28 14.05 14.45 +#WER on eval2000(tg) 17.4 17.2 17.5 +#WER on eval2000(fg) 15.7 15.4 15.7 +#Final train prob -0.105663 -0.106685 -0.105965 +#Final valid prob -0.130166 -0.122293 -0.122376 +#Final train prob (xent) -1.42483 -1.62108 -1.5454 +#Final valid prob (xent) -1.49792 -1.67695 -1.58129 + +# _6r is as _6q, but adding --self-repair-scale 0.00001 --xent-separate-forward-affine=true + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
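+    # (Rough reasoning behind the _fake2 directory, for reference: the original
+    # 10 ms-shift features store one iVector every 10 frames, i.e. one per
+    # 100 ms; with the 5 ms-shift "dbl" features the same 100 ms spans 20
+    # frames, so the iVectors themselves can be reused unchanged and only the
+    # declared period changes from 10 to 20.)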
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-2,0,2 -2,0,2,4 -6,0,6 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6t.sh b/egs/swbd/s5c/local/chain/run_tdnn_6t.sh new file mode 100755 index 00000000000..47921335155 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6t.sh @@ -0,0 +1,512 @@ +#!/bin/bash + +# since _6s didn't work that well, in 6t we try something else: +# modifying 6s to use almost exactly the same splicing indexes as 6r, +# but with the first splice indexes changed from -1,0,1 to -1,1, so that +# all the differences are multiples of 2 (so the effective frame rate is +# the normal frame rate). In effect we're using a narrower splicing +# at the start of the nnet, than 6s. + +# 6t does seem better than 6s, but not quite as good as 6r. 
+# the fact that it's not as good as 6r may show that the double-frame-rate +# input was actually giving us some useful information-- although the +# improvement is only something like 0.1%-0.2%, and we didn't actually see +# any difference in the objective function from 6r, which undermines the +# notion that by removing that central 0 splice at the input, we lost +# some information. +# +# +#local/chain/compare_wer.sh 6r 6s 6t +#System 6r 6s 6t +#WER on train_dev(tg) 15.06 15.50 15.34 +#WER on train_dev(fg) 14.05 14.45 14.23 +#WER on eval2000(tg) 17.2 17.5 17.2 +#WER on eval2000(fg) 15.4 15.7 15.6 +#Final train prob -0.106685 -0.105965 -0.106575 +#Final valid prob -0.122293 -0.122376 -0.121902 +#Final train prob (xent) -1.62108 -1.5454 -1.62226 +#Final valid prob (xent) -1.67695 -1.58129 -1.67252 + +# _6s is as _6r, but changing the splicing indexes to be exactly the same as 6h, +# but all multiplied by 2. This means that for any given frame-shift, the network +# sees exactly the same input as 6h; the only differences is that we see +# more frame shifts, i.e. the data is more carefully perturbed than 6h. +# this is to help disentangle whether the improvement really comes from the +# higher-resolution features, or from the improved data shifting. + +# _6r is as _6q, but adding --self-repair-scale 0.00001 --xent-separate-forward-affine=true + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. 
Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
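+    # (Same reasoning as in the 6s script: one iVector per 10 frames at a
+    # 10 ms shift is one per 100 ms; at the 5 ms "dbl" shift that 100 ms is
+    # 20 frames, so we reuse the iVectors and only bump the period to 20.)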
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6s_sp/egs \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6u.sh b/egs/swbd/s5c/local/chain/run_tdnn_6u.sh new file mode 100755 index 00000000000..4c48a75ffd6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6u.sh @@ -0,0 +1,524 @@ +#!/bin/bash + +# _6u is as _6h, but with slightly different splicing indexes (start +# narrower than 6h and ramp up slowly). These are designed to be +# equivalent to those in 6t, except for use with normal-frame-rate, +# not double-frame-rate, input. The difference between 6t and 6u +# will show us whether having double-frame-rate input for the purpose +# of getting more different shifted versions of the input, is helpful. 
+
+# [however, note that the number of frames-per-iter is not comparable
+# between 6t and 6u: here we're using 1.2 million frames per iter,
+# and 6s is using 3 million which at the normal frame rate would be
+# 1.5 million, and 1.2 != 1.5.]
+
+# 6u is no better than 6h, and maybe slightly worse. Certainly it's worse than
+# 6t. In addition, the train-valid difference is bigger with 6h and 6u than
+# with 6t. This is all consistent with the notion that the higher-frame-rate
+# input, with which we can generate more shifted versions, does really make a
+# difference. However, I want to wait till the 6v->6w comparison is ready,
+# which may let us know whether the difference in frames-per-iter could have
+# been a confounding factor here. (It's unlikely, but possible).
+#
+#local/chain/compare_wer.sh 6h 6t 6u
+#System 6h 6t 6u
+#WER on train_dev(tg) 15.46 15.34 15.46
+#WER on train_dev(fg) 14.28 14.23 14.28
+#WER on eval2000(tg) 17.4 17.2 17.6
+#WER on eval2000(fg) 15.7 15.6 15.9
+#Final train prob -0.105663 -0.106575 -0.108665
+#Final valid prob -0.130166 -0.121902 -0.129495
+#Final train prob (xent) -1.42483 -1.62226 -1.54189
+#Final valid prob (xent) -1.49792 -1.67252 -1.60749
+
+# _6h is as _6g but adding --xent-separate-forward-affine=true, which
+# gives a separate last-but-one weight matrix to the xent output.
+
+# Although this slight improvement is probably not significant, it's a
+# sensible idea so I think I'll stick with it.
+#local/chain/compare_wer.sh 6g 6h
+#System 6g 6h
+#WER on train_dev(tg) 15.50 15.46
+#WER on train_dev(fg) 14.31 14.28
+#WER on eval2000(tg) 17.5 17.4
+#WER on eval2000(fg) 15.8 15.7
+#Final train prob -0.105853 -0.105663
+#Final valid prob -0.129997 -0.130166
+
+# _6g is as _6f but increasing the parameters (increasing
+# jesus-forward-input-dim from 500 to 600).
+
+# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change
+# means there is no hidden part in the jesus layer (it's just repeated affine and relu).
+
+# slightly worse, but encouragingly small difference.
+#local/chain/compare_wer.sh 5v 6f
+#System 5v 6f
+#WER on train_dev(tg) 15.38 15.71
+#WER on train_dev(fg) 14.39 14.50
+#WER on eval2000(tg) 17.4 17.5
+#WER on eval2000(fg) 15.7 15.9
+#Final train prob -0.11156 -0.111305
+#Final valid prob -0.131797 -0.131487
+
+# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
+
+# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
+#
+#local/chain/compare_wer.sh 5e 5s 5t 5v
+#System 5e 5s 5t 5v
+#WER on train_dev(tg) 15.43 15.47 15.43 15.38
+#WER on train_dev(fg) 14.32 14.31 14.34 14.39
+#WER on eval2000(tg) 17.3 17.4 17.4 17.4
+#WER on eval2000(fg) 15.5 15.6 15.6 15.7
+#Final train prob -0.110056 -0.110928 -0.110752 -0.11156
+#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797
+
+# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
+# up), from 5000 to 3500.
+
+# about 5s: comparing with 5e which is the most recent baseline we actually
+# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
+# jesus-hidden-dim reduced 7500 to 5000, and the new option
+# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
+# smaller jesus-hidden-dims.
+
+# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
+# value of 1700 (between 1500 and 1800), and also fixing a bug in the self-repair
+# code which was doubling the thresholds so there was, in effect,
+# no upper threshold. 
I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0 -1,0,1 -2,-1,0,1 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6v.sh b/egs/swbd/s5c/local/chain/run_tdnn_6v.sh new file mode 100755 index 00000000000..df711d31aa1 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6v.sh @@ -0,0 +1,283 @@ +#!/bin/bash +# This script contains online decoding using chain + nnet3 setup. +# _6v is as _6h, but moving to a TDNN+ReLU recipe instead of using jesus-layer. +# Otherwise we make everything as similar as possible to 6h. +# The ReLU dimension, at 576, is chosen to make the number of parameters about +# the same as 6h. + +# great improvement! +# local/chain/compare_wer.sh 6h 6v +# System 6h 6v +# WER on train_dev(tg) 15.46 15.00 +# WER on train_dev(fg) 14.28 13.91 +# WER on eval2000(tg) 17.4 17.2 +# WER on eval2000(fg) 15.7 15.7 + +# the following objf values are computed on the last iter (323), because due to +# a script bug, now fixed, the 'final' ones were not computed in 6v. +# note: in this run the xent learning rate was too slow. 
+# 323 train prob -0.129285 -0.120026 +# 323 valid prob -0.151648 -0.140628 +# 323 train prob (xent) -1.4443 -1.5431 +# 323 valid prob (xent) -1.51731 -1.56975 + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6v # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
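  # (A minimal sketch, not part of the original recipe: this run reuses the egs
  #  dumped for the 2y experiment via the --egs.dir option of the train.py call
  #  below, so it can be useful to fail early if that directory is missing.
  #  The path is the one hard-coded in the training call.)
  if [ ! -d exp/chain/tdnn_2y_sp/egs ]; then
    echo "$0: expected existing egs in exp/chain/tdnn_2y_sp/egs (dump them with the 2y recipe first)"
    exit 1;
  fi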
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +# Results using offline and online decoding +# System 6v_sp 6v_sp_online 6v_sp_online{per_utt} +# WER on train_dev(tg) 14.68 14.72 15.43 +# WER on train_dev(fg) 13.49 13.58 14.18 +# WER on eval2000(tg) 17.2 17.3 18.2 +# WER on eval2000(fg) 15.7 15.9 16.7 + +#if [ $stage -le 15 ]; then +# # If this setup used PLP features, we'd have to give the option --feature-type plp +# # to the script below. +# steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ +# data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +#fi + + + +#if [ $stage -le 16 ]; then +# iter_opts= +# if [ ! -z $decode_iter ]; then +# iter_opts=" --iter $decode_iter " +# fi +# for decode_set in train_dev eval2000; do +# ( +# steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ +# --nj 50 --cmd "$decode_cmd" $iter_opts --config conf/decode_online.config \ +# $graph_dir data/${decode_set}_hires ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; +# if $has_fisher; then +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ +# data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ +# ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; +# fi +# ) & +# done +#fi +# +#if [ $stage -le 17 ]; then +# iter_opts= +# if [ ! 
-z $decode_iter ]; then +# iter_opts=" --iter $decode_iter " +# fi +# for decode_set in train_dev eval2000; do +# ( +# steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --config conf/decode_online.config \ +# --nj 50 --cmd "$decode_cmd" $iter_opts --per-utt true \ +# $graph_dir data/${decode_set}_hires ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff}_per_utt || exit 1; +# if $has_fisher; then +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ +# data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ +# ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_per_utt || exit 1; +# fi +# ) & +# done +#fi +# +wait; + +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6w.sh b/egs/swbd/s5c/local/chain/run_tdnn_6w.sh new file mode 100755 index 00000000000..3e3bb622290 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6w.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# I discovered after running this that there was a problem with the egs-dumping, +# which seems to have existed for quite a while: the --right-tolerance defaults to 10 +# in the script, but it should have been 5, to match the code. However, 6v was +# run with older egs (before this bug was introduced) from 2y, so it doesn't +# have the problem. + +# note regarding the changes in objfs: these have explanations, they are due to +# the --right-tolerance increasing from 5->10 in 6v->6w: the chain objfs improve +# because of the less-restrictive numerator graphs, and the xent objfs get worse +# because the phone alignments become less consistent; we can see the reverse +# pattern in 6y -> 6z when we revert the right-tolerance back to 5. +# +#local/chain/compare_wer.sh 6v 6w +#System 6v 6w +#WER on train_dev(tg) 15.00 15.33 +#WER on train_dev(fg) 13.91 14.27 +#WER on eval2000(tg) 17.2 17.3 +#WER on eval2000(fg) 15.7 15.6 +#Final train prob -0.105012 -0.10287 +#Final valid prob -0.125877 -0.120451 +#Final train prob (xent) -1.54736 -1.63586 +#Final valid prob (xent) -1.57475 -1.67173 + + + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6w # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6x.sh b/egs/swbd/s5c/local/chain/run_tdnn_6x.sh new file mode 100755 index 00000000000..177ddd2a867 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6x.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# 6x is as 6w, but changing the splice-indexes to be like in 6u +# except since this is a TDNN setup, we need a final "0" [the jesus-layer +# setup had a final ReLU as a special case.]. +# These splice indexes start smaller, and ramp up more slowly, than +# the baseline in 6w. +# We're reusing the 6x egs. + +# no clear benefit; if anything, it's slightly worse. +# local/chain/compare_wer.sh 6w 6x +# System 6w 6x +# WER on train_dev(tg) 15.33 15.30 +# WER on train_dev(fg) 14.27 14.35 +# WER on eval2000(tg) 17.3 17.4 +# WER on eval2000(fg) 15.6 15.7 +# Final train prob -0.10287 -0.103078 +# Final valid prob -0.120451 -0.122477 +# Final train prob (xent) -1.63586 -1.73292 +# Final valid prob (xent) -1.67173 -1.75042 + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6x # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0 -1,0,1 -2,-1,0,1 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --egs.dir exp/chain/tdnn_6w_sp/egs \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6y.sh b/egs/swbd/s5c/local/chain/run_tdnn_6y.sh new file mode 100755 index 00000000000..a15c6648641 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6y.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# WER results are inconclusive, but objective values are encouraging. +# We'll keep the change as it makes sense. +# local/chain/compare_wer.sh 6w 6y +# System 6w 6y +# WER on train_dev(tg) 15.33 15.36 +# WER on train_dev(fg) 14.27 14.19 +# WER on eval2000(tg) 17.3 17.2 +# WER on eval2000(fg) 15.6 15.8 +# Final train prob -0.10287 -0.102139 +# Final valid prob -0.120451 -0.119654 +# Final train prob (xent) -1.63586 -1.55598 +# Final valid prob (xent) -1.67173 -1.58821 + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6y # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --egs.dir exp/chain/tdnn_6w_sp/egs \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z.sh new file mode 100755 index 00000000000..97cc1b83624 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. +# definitely better than 6y- not clear if we have managed to get the same +# results as 6v (could indicate that the larger frames-per-iter is not helpful? +# but I'd rather not decrease it as it would hurt speed). + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_l2.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_l2.sh new file mode 100755 index 00000000000..e1c8d263458 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_l2.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +# same as 6z but with only l2 regularization +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_l2 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
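  # (Editorial note, not in the original script: this is the first of a set of
  #  ablations of 6z -- _l2, _l2_leaky, _leaky and _none -- which keep different
  #  subsets of the --chain.l2-regularize and --chain.leaky-hmm-coefficient
  #  options and all drop the xent regularization. They all reuse the egs from
  #  exp/chain/tdnn_6z_sp via "$common_egs_dir", so the comparison isolates the
  #  effect of the regularization terms.)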
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_l2_leaky.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_l2_leaky.sh new file mode 100755 index 00000000000..157ecb2d6f7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_l2_leaky.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_l2_leaky # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_leaky.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_leaky.sh new file mode 100755 index 00000000000..3ac915142d3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_leaky.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +# same as 6z but with only l2 regularization +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_leaky # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.dir "$common_egs_dir" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_none.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_none.sh new file mode 100755 index 00000000000..e9aa68c2dd7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_none.sh @@ -0,0 +1,200 @@ +#!/bin/bash + +# same as 6z but with only l2 regularization +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_none # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. 
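# (A minimal sketch, not part of the original script: every variable in the
#  configuration section above can be overridden from the command line, since
#  utils/parse_options.sh, sourced a few lines below, maps options such as
#  --num-epochs onto the shell variables of the same name. For example, assuming
#  the 6z egs in $common_egs_dir have already been dumped, one could rerun just
#  the training and decoding stages with:)
# local/chain/run_tdnn_6z_none.sh --stage 13 --num-epochs 4 --remove-egs false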
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.dir "$common_egs_dir" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling.sh new file mode 100755 index 00000000000..10d4ce3ce73 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling.sh @@ -0,0 +1,200 @@ +#!/bin/bash + +# same as 6z but with no frame-subsampling +# This is a simplification of the _6z script as we use the normal lang/ directory, +# we set frame-subsampling-factor and alignment-subsampling-factor to 1. +# it is at least 3 times slower than _6z, +# We increase the num-epochs, possibly by a factor of 3 [since there would only be one shift]. + + +set -e + +# configs for 'chain' +affix= +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_nosubsamp # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=12 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=32 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 1 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.frame-subsampling-factor 1 \ + --chain.alignment-subsampling-factor 1 \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.0333 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling4.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling4.sh new file mode 100755 index 00000000000..6acba86b3af --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling4.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# same as 6z but with no frame-subsampling +# This is a simplification of the _6z script as we use the normal lang/ directory, +# we set frame-subsampling-factor and alignment-subsampling-factor to 1. +# it is at least 3 times slower than _6z, +# We increase the num-epochs, possibly by a factor of 3 [since there would only be one shift]. + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_nosubsamp_100cw_lowreg # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=12 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.414 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=576 +frames_per_eg=100 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_nosubsamp_100cw_sp/egs +xent_regularize=0.0333 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.dir "$common_egs_dir" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_l2.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_l2.sh new file mode 100755 index 00000000000..4c91a49ca90 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_l2.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_xent_l2 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_leaky.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_leaky.sh new file mode 100755 index 00000000000..d9696a91795 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_leaky.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_xent_leaky # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs +xent_regularize=0.1 + + + +# End configuration section. 
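# The variables in the configuration section above are only defaults:
# utils/parse_options.sh (sourced just below) rewrites any of them from a
# matching command-line flag, with underscores in the variable name mapped to
# dashes in the flag.  A hypothetical invocation, shown purely as a sketch
# (the values are examples, not tuned settings), would be:
#
#   local/chain/run_tdnn_6z_xent_leaky.sh --num-epochs 6 \
#     --dir exp/chain/tdnn_6z_xent_leaky_6epoch --train-stage -10
#
# so experiments can be varied without editing the script itself.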
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7a.sh b/egs/swbd/s5c/local/chain/run_tdnn_7a.sh new file mode 100755 index 00000000000..95c3c9f4c24 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7a.sh @@ -0,0 +1,262 @@ +#!/bin/bash + +# 7a inherits from 6z (which is a TDNN+ReLU-based network with various small +# bugs hopefully fixed now), and from 6r, which is our most-successful +# double-frame-rate system. We're re-dumping the egs, because the egs used in +# 6r used right-tolerance=10, which turns out to have been a bug, and not a +# helpful one. + +# it is not better than 6z. +# local/chain/compare_wer.sh 6v 6z 7a +#System 6v 6z 7a +#WER on train_dev(tg) 15.00 15.18 15.05 +#WER on train_dev(fg) 13.91 14.06 14.10 +#WER on eval2000(tg) 17.2 17.2 17.3 +#WER on eval2000(fg) 15.7 15.6 15.7 +#Final train prob -0.105012 -0.106268 -0.110288 +#Final valid prob -0.125877 -0.126726 -0.127071 +#Final train prob (xent) -1.54736 -1.4556 -1.59569 +#Final valid prob (xent) -1.57475 -1.50136 -1.62312 + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7a # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=2 # use 2 not 4 epochs, as with the double-frame-rate input, we + # shift the input data in double the number of distinct ways + # on each epoch. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires_dbl \ + --ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{7,11,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
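# Frame-rate bookkeeping for the double-frame-rate run below (a sketch of the
# arithmetic, not extra configuration): the _dbl features arrive at 200
# frames/sec (frame shift 0.005 s, as written to $dir/frame_shift later),
# while the lattices in exp/tri4_lats_nodup$suffix are still at the normal
# 100 frames/sec.  The chain output stays at one frame per 30 ms, so
#   frame-subsampling-factor     = 6   # 200 Hz features  -> 33.3 Hz output
#   alignment-subsampling-factor = 3   # 100 Hz lattices  -> 33.3 Hz output
# and the ivector_period of 20 set up in the _fake2 directories keeps one
# ivector per 0.1 s of audio, matching the original period of 10 at 100 Hz.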
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.frame-subsampling-factor 6 \ + --chain.alignment-subsampling-factor 3 \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 300 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 3000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_dbl \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 17 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/run_tdnn_7b.sh new file mode 100755 index 00000000000..4d138cc5da0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7b.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# 7b is as 6z, but increasing the relu-dim slightly from 576 to 625. + +# there is very little change. looks like we were close to the optimum. +# local/chain/compare_wer.sh 6z 7b +# System 6z 7b +# WER on train_dev(tg) 15.18 15.15 +# WER on train_dev(fg) 14.06 14.19 +# WER on eval2000(tg) 17.2 17.2 +# WER on eval2000(fg) 15.6 15.5 +# Final train prob -0.106268 -0.102617 +# Final valid prob -0.126726 -0.126529 +# Final train prob (xent) -1.4556 -1.43802 +# Final valid prob (xent) -1.50136 -1.4964 + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. +# definitely better than 6y- not clear if we have managed to get the same +# results as 6v (could indicate that the larger frames-per-iter is not helpful? 
+# but I'd rather not decrease it as it would hurt speed). + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=625 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir exp/chain/tdnn_6z_sp/egs \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7c.sh b/egs/swbd/s5c/local/chain/run_tdnn_7c.sh new file mode 100755 index 00000000000..05cb2148ba0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7c.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# 7c is as 6z, but reducing the left and right tolerance from 5 to 4. + +# No clear difference. + +# I reran the scoring of train_dev for 6z because the scoring script +# has had a bug fixed. 
+# local/score.sh data/train_dev exp/chain/tdnn_6z_sp/graph_sw1_tg exp/chain/tdnn_6z_sp/decode_train_dev_sw1_tg +# local/score.sh data/train_dev exp/chain/tdnn_6z_sp/graph_sw1_tg exp/chain/tdnn_6z_sp/decode_train_dev_sw1_fsh_fg; +# local/chain/compare_wer.sh 6z 7c +# System 6z 7c +# WER on train_dev(tg) 14.88 14.89 +# WER on train_dev(fg) 13.66 13.69 +# WER on eval2000(tg) 17.2 17.2 +# WER on eval2000(fg) 15.6 15.5 +# Final train prob -0.106268 -0.107003 +# Final valid prob -0.126726 -0.133782 +# Final train prob (xent) -1.4556 -1.40549 +# Final valid prob (xent) -1.50136 -1.47833 + + + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7c # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.left-tolerance 4 --chain.right-tolerance 4 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7d.sh b/egs/swbd/s5c/local/chain/run_tdnn_7d.sh new file mode 100644 index 00000000000..d33755602bd --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7d.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# 7d is as 7b, but changing the HMM context from triphone to left biphone. + +# Left biphone model turns out to be as good as triphone model. +# local/chain/compare_wer.sh 7b 7d +# System 7b 7d +# WER on train_dev(tg) 15.10 15.03 +# WER on train_dev(fg) 14.21 14.22 +# WER on eval2000(tg) 17.2 17.4 +# WER on eval2000(fg) 15.9 15.9 +# Final train prob -0.100551 -0.092629 +# Final valid prob -0.123914 -0.11354 +# Final train prob (xent) -1.43215 -1.27932 +# Final valid prob (xent) -1.46662 -1.33193 +# Real-time factor 0.918978 0.711695 + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7d # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=625 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_a.sh b/egs/swbd/s5c/local/chain/run_tdnn_a.sh new file mode 100755 index 00000000000..d77cb4a518a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_a.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +# caution: the egs for this were dumped with a bug in the numerator lattices, +# you can subtract 0.0152 from the likelihoods to correct for this. (compare +# exp/chain/tdnn_a_sp/log/compute_prob_valid.final.log.new and +# exp/chain/tdnn_a_sp/log/compute_prob_valid.final.log for an explanation). + +set -e + +# configs for 'chain' +stage=9 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_a # Note: _sp will get added to this if $speed_perturb == true. + + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 5000 data/$train_set data/lang_chain $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the CTC training more freedom). + # use the same num-jobs as the alignments + nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \ + data/lang exp/tri4 exp/tri4_lats_nodup$suffix + rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
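# What the create_split_dir.pl call above does (illustrative note only; the
# CLSP-specific paths are the ones already listed): $dir/egs/storage is
# populated with numbered symlinks, one per /export/b0{1,2,3,4} disk, and the
# egs-dumping code round-robins its archives across them so no single disk
# has to hold all of the examples.  On another cluster you would substitute
# your own scratch disks, e.g. (hypothetical paths):
#
#   utils/create_split_dir.pl /scratch{1,2}/$USER/swbd-egs/storage \
#     $dir/egs/storage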
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_a2.sh b/egs/swbd/s5c/local/chain/run_tdnn_a2.sh new file mode 100755 index 00000000000..0289505f593 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_a2.sh @@ -0,0 +1,146 @@ +#!/bin/bash + + + +set -e + +# configs for 'chain' +stage=9 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_a2 # Note: _sp will get added to this if $speed_perturb == true. + + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +max_param_change=1.0 # match the way the code was when we first ran this +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 5000 data/$train_set data/lang_chain $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the CTC training more freedom). + # use the same num-jobs as the alignments + nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \ + data/lang exp/tri4 exp/tri4_lats_nodup$suffix + rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_b.sh b/egs/swbd/s5c/local/chain/run_tdnn_b.sh new file mode 100755 index 00000000000..3929527171c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_b.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_b # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_c.sh b/egs/swbd/s5c/local/chain/run_tdnn_c.sh new file mode 100755 index 00000000000..e7f7c756a08 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_c.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. +# also setting max-param-change=1, which it seems is what the 'a' one was run with +# (it was the default in the code at that time). + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +max_param_change=1.0 +final_layer_normalize_target=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 7000 data/$train_set data/lang_chain $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_d.sh b/egs/swbd/s5c/local/chain/run_tdnn_d.sh new file mode 100755 index 00000000000..fa103660f69 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_d.sh @@ -0,0 +1,161 @@ +#!/bin/bash + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_d # Note: _sp will get added to this if $speed_perturb == true. 
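+# (Illustration of the note above, assuming the default speed_perturb=true:
+# this run's models would then end up under exp/chain/tdnn_d_sp, which is the
+# kind of name the later scripts' result comparisons refer to.)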
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +max_param_change=1.0 +final_layer_normalize_target=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 8000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # adding --target-num-history-states 500 to match the egs of run_lstm_a.sh. The + # script must have had a different default at that time. + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_e.sh b/egs/swbd/s5c/local/chain/run_tdnn_e.sh new file mode 100755 index 00000000000..3d6aef09224 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_e.sh @@ -0,0 +1,167 @@ +#!/bin/bash + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. 
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_e # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # adding --target-num-history-states 500 to match the egs of run_lstm_a.sh. The + # script must have had a different default at that time. + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
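+  # (Specifically, mkgraph takes the HMM topology from the model and tree in
+  # $dir, and only the lexicon/grammar from data/lang_sw1_tg, so the apparent
+  # mismatch should be harmless.)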
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_f.sh b/egs/swbd/s5c/local/chain/run_tdnn_f.sh new file mode 100755 index 00000000000..22e4de418c7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_f.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_f # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # adding --target-num-history-states 500 to match the egs of run_lstm_a.sh. The + # script must have had a different default at that time. 
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_g.sh b/egs/swbd/s5c/local/chain/run_tdnn_g.sh new file mode 100755 index 00000000000..aed6401e230 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_g.sh @@ -0,0 +1,174 @@ +#!/bin/bash + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_g # Note: _sp will get added to this if $speed_perturb == true. 
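+# (On the notation used in splice_indexes below: each space-separated group
+# lists the frame offsets spliced together at one layer, so the "-9,0,9"
+# group is the wider near-final-layer splicing that the "_g is as _f but more
+# splicing at last layer" comment above refers to.)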
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # adding --target-num-history-states 500 to match the egs of run_lstm_a.sh. The + # script must have had a different default at that time. + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_h.sh b/egs/swbd/s5c/local/chain/run_tdnn_h.sh new file mode 100755 index 00000000000..b3917ac9a2c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_h.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) +# The WER is quite a bit worse. 
+# b01:s5c: grep Sum exp/chain/tdnn_g_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | utils/best_wer.sh +# %WER 13.1 | 1831 21395 | 88.6 8.1 3.4 1.7 13.1 50.0 | exp/chain/tdnn_g_sp/decode_eval2000_sw1_fsh_fg/score_11_0.5/eval2000_hires.ctm.swbd.filt.sys +# b01:s5c: grep Sum exp/chain/tdnn_h_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | utils/best_wer.sh +# %WER 14.9 | 1831 21395 | 87.1 9.0 3.9 2.0 14.9 52.3 | exp/chain/tdnn_h_sp/decode_eval2000_sw1_fsh_fg/score_14_0.0/eval2000_hires.ctm.swbd.filt.sys + +# the train objf is a bit better. The valid objf is about the same but can't really be trusted as +# we had the bug where there was no utt2uniq file. +# exp/chain/tdnn_h_sp/log/compute_prob_train.final.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:131) Overall log-probability for 'output' is -0.0788236 per frame, over 10000 frames. +#exp/chain/tdnn_g_sp/log/compute_prob_train.final.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:131) Overall log-probability for 'output' is -0.08124 per frame, over 10000 frames. + + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_h # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=0.3333 +scale_max_param_change=true +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_g_sp/egs \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --scale-max-param-change $scale_max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_i.sh b/egs/swbd/s5c/local/chain/run_tdnn_i.sh new file mode 100755 index 00000000000..9519ecc2789 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_i.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. +# be cautious comparing the valid probs with h though, because +# we fixed the utt2uniq bug at this point, so from h on, the valid probs +# are properly held out. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_i # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=0.3333 +scale_max_param_change=true +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --scale-max-param-change $scale_max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_j.sh b/egs/swbd/s5c/local/chain/run_tdnn_j.sh new file mode 100755 index 00000000000..8b1ff96ae5f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_j.sh @@ -0,0 +1,189 @@ +#!/bin/bash + +# _j is as _i and using the same egs, but setting +# --left-deriv-truncate and --right-deriv-truncate to 10 +# instead of 5. +# This does not seem to be helpful at all: WERs are the same or even worse. With +# the trigram model and evaluating on all of eval2000, the WER with the 'i' +# model is 21.1, and of this model is 21.3. +# However, it probably would have made sense to set --frames-overlap-per-eg +# to a larger number - at least 20 - in this setup. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_j # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=0.3333 +scale_max_param_change=true +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_i_sp/egs \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 10 --right-deriv-truncate 10 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --scale-max-param-change $scale_max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_k.sh b/egs/swbd/s5c/local/chain/run_tdnn_k.sh new file mode 100755 index 00000000000..2393ddaffbb --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_k.sh @@ -0,0 +1,184 @@ +#!/bin/bash + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. +# [only] after 4gram rescoring, only 0.1% better than _i. :-( +# %WER 12.7 | 1831 21395 | 89.0 7.8 3.3 1.7 12.7 49.2 | exp/chain/tdnn_k_sp/decode_eval2000_sw1_fsh_fg/score_12_1.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 12.8 | 1831 21395 | 88.8 7.8 3.4 1.6 12.8 49.3 | exp/chain/tdnn_i_sp/decode_eval2000_sw1_fsh_fg/score_14_0.0/eval2000_hires.ctm.swbd.filt.sys + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. 
+ +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_k # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
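+  # (Keeping the egs matters here because this run, like several in this
+  # series, reuses a previous run's examples via --egs-dir, in this case
+  # exp/chain/tdnn_i_sp/egs below; the .nodelete marker is intended to stop
+  # them from being cleaned up in the meantime.)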
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_i_sp/egs \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_l.sh b/egs/swbd/s5c/local/chain/run_tdnn_l.sh new file mode 100755 index 00000000000..1c7f431d4ec --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_l.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# _l is as _k but even longer chunk size: 200 instead of 150. having to halve +# minibatch size to save memory. I correspondingly changed max-param-change. +# ... perhaps very slightly better than k: after 4-gram rescoring, looking at the +# whole of the eval2000 dataset we get improvement 18.9->18.7, but before +# 4-gram rescoring there is no change (20.7). +# on the Swbd subset the improvement is 0.1% before rescoring (14.3->14.2), +# and 0.3% after rescoring (12.7 -> 12.4). + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. 
+ +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_l # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=0.666 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=200 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_m.sh b/egs/swbd/s5c/local/chain/run_tdnn_m.sh new file mode 100755 index 00000000000..9d29447f78c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_m.sh @@ -0,0 +1,189 @@ +#!/bin/bash + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_m # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. 
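+# (All of the options above are plain shell variables, so once
+# utils/parse_options.sh is sourced below they can be overridden from the
+# command line rather than by editing the script.  A purely hypothetical
+# invocation, for illustration only:
+#   local/chain/run_tdnn_m.sh --stage 12 --train-stage 50
+# which would resume a partially completed training run at iteration 50.)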
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_n.sh b/egs/swbd/s5c/local/chain/run_tdnn_n.sh new file mode 100755 index 00000000000..78029e7161f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_n.sh @@ -0,0 +1,199 @@ +#!/bin/bash + +# _n is as _m but changing the egs configuration to get better and more even +# coverage of the data: increasing frames_per_eg from 150 to 200, +# and increasing --frames-overlap-per-eg from 10 to 30. +# I am also testing out some script changes in the get_egs.sh script that +# aims to reduce the number of small files (and some accompanying code changes +# that allow us to put the CPU-intensive phase of egs preparation with the +# 'shuffle' jobs). 
+# +# This doesn't seem to have made any consistent difference at all (although on +# average the change was slightly beneficial): on all of eval2000, m->n changed +# 20.9->20.8 with trigram and 18.6->18.7 after 4g rescoring; on train_dev it +# changed 19.31->19.04 with trigram, and 17.58->17.45 after 4g rescoring. + + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_n # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
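+  # (Rough arithmetic behind the header comment: with --frames-per-eg 200 and
+  # --frames-overlap-per-eg 30, consecutive chunks cut from an utterance share
+  # 30 frames, i.e. their start points advance by about 170 frames, which is
+  # what gives the more even coverage of the data mentioned above.)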
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_o.sh b/egs/swbd/s5c/local/chain/run_tdnn_o.sh new file mode 100755 index 00000000000..8085c3a80fe --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_o.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# _o is as _n, but reducing the number of parameters to try to reduce +# over-training: reducing relu-dim from 1024 to 850 and target num-states +# from 12k to 9k. Also modifying the splicing setup in a way that shouldn't +# affect num-params, from "-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" to +# "-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3". +# +# There seems to be a slight improvement: on train_dev, WER changes 19.04->18.99 before +# rescoring, and 17.45->17.29 after. On all of eval2000 the WER changes +# from 20.8->20.6 before fg rescoring, and 18.7->18.5 after. + +# _n is as _m but changing the egs configuration to get better and more even +# coverage of the data: increasing frames_per_eg from 150 to 200, +# and increasing --frames-overlap-per-eg from 10 to 30. +# I am also testing out some script changes in the get_egs.sh script that +# aims to reduce the number of small files (and some accompanying code changes +# that allow us to put the CPU-intensive phase of egs preparation with the +# 'shuffle' jobs). + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. 
+ +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_o # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
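+
+  # A quick comparison of the old and new splice_indexes strings (an
+  # illustrative check, not needed by the recipe): per layer, the most
+  # negative offset adds to the total left context and the most positive one
+  # to the total right context, while the number of offsets is what drives
+  # that layer's input dimension.
+  for s in "-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" "-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3"; do
+    echo "$s" | awk '{ nleft = 0; nright = 0; n = 0;
+      for (i = 1; i <= NF; i++) { m = split($i, a, ","); lo = a[1] + 0; hi = a[1] + 0;
+        for (j = 1; j <= m; j++) { v = a[j] + 0; if (v < lo) lo = v; if (v > hi) hi = v; n++ }
+        nleft += lo; nright += hi }
+      printf("offsets=%d left-context=%d right-context=%d  [%s]\n", n, -nleft, nright, $0) }'
+  done
+  # Both strings contain 13 offsets in total, which is consistent with the
+  # header's note that the change should not affect the parameter count
+  # (assuming the same hidden dimension at every layer); only the temporal
+  # context is redistributed.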
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +# Running another decode with tighter beam. +# time is about twice faster-- easily within real-time even on fairly old machines. +# degradation on eval2000 is 14.2->14.4 before rescoring and 12.2->12.5 after; +# on train_dev is's 18.99->19.09 before rescoring, and 17.29->17.55 after. Probably +# the greater degradation after rescoring is due to the lattice-beam being too tight, +# which might not even affect the speed much (could easily make it 7.0). +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --beam 11.0 --lattice-beam 6.0 --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff}_11_6 || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_11_6 || exit 1; + fi + ) & + done +fi + +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_p.sh b/egs/swbd/s5c/local/chain/run_tdnn_p.sh new file mode 100755 index 00000000000..97bb6dfbfc0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_p.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. 
give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_p # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
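+
+  # Shape of the per-frame derivative weights described in the header (a
+  # sketch assuming a simple linear ramp; the exact shape is decided by the
+  # code, not by this script): 10 zero-weight frames at each edge of a
+  # 150-frame chunk, then a ramp up to 1.0 over the next 10 frames.
+  awk 'BEGIN { T = 150; zero = 10; ramp = 10;
+    for (t = 0; t < T; t++) {
+      d = (t < T - 1 - t) ? t : T - 1 - t;            # distance from the nearer edge
+      w = (d < zero) ? 0.0 : (d < zero + ramp ? (d - zero + 1) / ramp : 1.0);
+      printf("%.1f%s", w, (t == T - 1) ? "\n" : " ") } }'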
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_q.sh b/egs/swbd/s5c/local/chain/run_tdnn_q.sh new file mode 100755 index 00000000000..70274105c93 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_q.sh @@ -0,0 +1,206 @@ +#!/bin/bash + + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. +# This reduction in parameters seems to be helpful: on train_dev (fg), +# change is 18.45 -> 18.07, and on all of eval2000 (fg), from 20.0 -> 19.8. + + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_q # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
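+
+  # Rough scale of the reduction from relu-dim 1024 -> 850 (illustrative only,
+  # ignoring biases and the input and output layers): a hidden affine layer
+  # that splices two hidden-layer frames has about (2 * relu_dim) * relu_dim
+  # weights, so each such layer shrinks by the square of the dimension ratio.
+  awk 'BEGIN { printf("per-hidden-layer weight ratio ~ %.2f\n", (850.0 * 850) / (1024 * 1024)) }'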
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_r.sh b/egs/swbd/s5c/local/chain/run_tdnn_r.sh new file mode 100755 index 00000000000..3dcb1311db4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_r.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# _r is as _q except adding --lm-opts "--num-extra-states=0" +# to reduce the size of the phone LM. Not really expecting much difference + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_r # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
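+
+  # The point of --lm-opts "--num-extra-states=0" is to shrink the phone LM
+  # and hence the denominator graph; comparing file sizes against the _q run
+  # shows the effect once both runs have trained (this assumes the denominator
+  # FST is written to $dir/den.fst, as the chain training scripts do; the
+  # command prints nothing if the FSTs are not there yet):
+  ls -l exp/chain/tdnn_q_sp/den.fst $dir/den.fst 2>/dev/null || true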
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --lm-opts "--num-extra-states=0" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_s.sh b/egs/swbd/s5c/local/chain/run_tdnn_s.sh new file mode 100755 index 00000000000..7ee23833fc9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_s.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_s # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
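+
+  # Relative improvement corresponding to the absolute WER changes quoted in
+  # the header (a quick arithmetic check, nothing more):
+  awk 'BEGIN { printf("eval2000 fg: %.1f%% rel.,  train_dev fg: %.1f%% rel.\n",
+               100 * (19.8 - 18.0) / 19.8, 100 * (18.07 - 16.96) / 18.07) }'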
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --egs-dir exp/chain/tdnn_q_sp/egs \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_t.sh b/egs/swbd/s5c/local/chain/run_tdnn_t.sh new file mode 100755 index 00000000000..8b5805093e2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_t.sh @@ -0,0 +1,211 @@ +#!/bin/bash + + +# _t is as _s but setting pdf-boundary-penalty to 2.0 +# This makes things slightly worse: 18.0->18.2 on eval2000 after fg rescoring (20.1->20.1 before) +# and 16.96->17.26 on train_dev after fg rescoring (18.45->18.68 before). + +# _s is as _q but setting pdf-boundary-penalty to 0.0 + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. 
On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_t # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
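+
+  # This run reuses the egs dumped by the _q experiment (see --egs-dir below),
+  # so it is worth checking that they are still around before launching
+  # training (a defensive check, not part of the original recipe):
+  [ -d exp/chain/tdnn_q_sp/egs ] || { echo "$0: expected egs in exp/chain/tdnn_q_sp/egs" >&2; exit 1; }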
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 2.0 \ + --egs-dir exp/chain/tdnn_q_sp/egs \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_u.sh b/egs/swbd/s5c/local/chain/run_tdnn_u.sh new file mode 100755 index 00000000000..62470d31068 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_u.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# _u is as _t but also setting --truncate-deriv-weights 3. +# This doesn't seem to be helpful, or at least inconsistent: 18.2->18.6 on all of eval2000 +# after fg rescoring (20.1->20.7 before); on train_dev, 17.26->17.14 after fg rescoring, +# or 18.6->18.74 before. So worse on eval2000, inconsistent on train_dev. +# The train and valid probs are actually quite different: -0.111 -> -0.1427 on train, +# -0.109 -> -0.783 on valid. So it looks like the edge effects do make a difference- +# maybe some kind of regularization effect? +# +# _t is as _s but setting pdf-boundary-penalty to 2.0 +# _s is as _q but setting pdf-boundary-penalty to 0.0 + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. 
+ +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_u # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
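+
+  # The train/valid probabilities discussed in the header can be pulled out of
+  # the training logs once the run finishes, e.g. (assuming the usual
+  # compute_prob_{train,valid}.ITER.log naming of the nnet3 training scripts,
+  # with the per-iteration probability on lines containing 'Overall'):
+  grep -H Overall $dir/log/compute_prob_{train,valid}.*.log 2>/dev/null | tail -n 4 || true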
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --truncate-deriv-weights 3 \ + --pdf-boundary-penalty 2.0 \ + --egs-dir exp/chain/tdnn_q_sp/egs \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_v.sh b/egs/swbd/s5c/local/chain/run_tdnn_v.sh new file mode 100755 index 00000000000..206e8aa45f9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_v.sh @@ -0,0 +1,222 @@ +#!/bin/bash + +# _v is as _u but setting pdf-boundary-penalty to 0.0 (as in t->s), +# and also trying a smaller language model: --lm-opts "--num-extra-states=0" +# +# It's worse: on train_dev, 18.73->19.29 with tg, 17.14->17.75 with fg. [around 0.6 abs worse] +# on eval2000, 20.1->20.7 with tg 18.2->18.6 with fg. [around 0.5 abs worse]. +# Now, the s->t stage was on average over the 4 conditions, about 0.2 worse, so the t->s change +# (changing pdf-boundary-penalty to 0.0) should have given 0.2 abs improvement. This means that we +# we have 0.7 to 0.8 abs degradation from setting --lm-opts "--num-extra-states=0". +# (Note: this could possibly be an interaction between the --truncate-deriv-weights and +# the pdf-boundary-penalty)? + +# +# +# _u is as _t but also setting --truncate-deriv-weights 3. +# _t is as _s but setting pdf-boundary-penalty to 2.0 +# _s is as _q but setting pdf-boundary-penalty to 0.0 + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... 
we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --lm-opts "--num-extra-states=0" \ + --truncate-deriv-weights 3 \ + --pdf-boundary-penalty 0.0 \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_w.sh b/egs/swbd/s5c/local/chain/run_tdnn_w.sh new file mode 100755 index 00000000000..36a54e3e5c5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_w.sh @@ -0,0 +1,215 @@ +#!/bin/bash + +# _w is as _s (with --pdf-boundary-penalty 0.0) but setting +# --lm-opts "--num-extra-states=500" (like the opposite of +# the u->v change, which was very unhelpful). Also making a script change +# to set the same --pdf-boundary-penalty value on the train and valid egs for +# diagnostics (this won't affect WERs). +# See the top of run_tdnn_2a.sh for the WER comparisons for this experiment. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. 
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_w # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --lm-opts "--num-extra-states=500" \ + --pdf-boundary-penalty 0.0 \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_x.sh b/egs/swbd/s5c/local/chain/run_tdnn_x.sh new file mode 100755 index 00000000000..cf5bb635200 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_x.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. +# See the top of run_tdnn_2a.sh for more detailed WER comparisons for this experiment. +# It's worse by about 0.3: on train_dev, +# before rescoring 16.96->17.22, after rescoring 18.45->18.67; on all of +# eval2000, before rescoring 20.1->20.4, after rescoring 18.0->18.4 + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... 
we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_x # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-states=0" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_y.sh b/egs/swbd/s5c/local/chain/run_tdnn_y.sh new file mode 100755 index 00000000000..06a3eff123e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_y.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# _y is as _s but trying --apply-deriv-weights false. (note: in the +# interim, the script was changed so the train and valid probs have --pdf-boundary-penalty 0 +# and are no longer comparable with the ones in _s. +# +# Compared to s, the results are improved: on train_dev, 18.45->18.04 with tg +# and 16.96->16.57 with fg; on all of eval2000, 20.1->19.8 with tg and 18.0 to +# 17.9 with fg. +# +# +# I recomputed the train and valid probs using the .486 model and no --pdf-boundary-penalty option, to +# be able to compre with the _s ones. In _s the (train,valid) probs at iter 485 were (-0.0691, -0.0997), +# in _y the (train,valid) probs at iter 486 were (-0.0655,-0.0998). So better on train, essentially +# the same on valid. It makes sense it would be better on train, since its overtraining is more +# closely aligned with the distribution of training segments on which we compute the objf-- also because +# we've simply trained more, i.e. equivalent to slightly more epochs. + + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_y # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --pdf-boundary-penalty 0.0 \ + --egs-dir exp/chain/tdnn_q_sp/egs \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + iter=300 + steps/nnet3/decode.sh --iter $iter --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff}_it$iter || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_it$iter || exit 1; + fi + ) & + done +fi +wait; + +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_z.sh b/egs/swbd/s5c/local/chain/run_tdnn_z.sh new file mode 100755 index 00000000000..db85df89a7d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_z.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also w, which has --num-extra-states=500, and 2a, which has 8000). +# See the top of un_tdnn_2a.sh for the WER comparisons for this experiment. 
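+# To line the --num-extra-states sweep up side by side (x=0, w=500, z=2000,
+# 2a=8000) once the runs finish, the show_wer.sh helper added elsewhere in
+# this patch should do the job, e.g.
+#   local/chain/show_wer.sh x w z 2a
+# (it greps the best WERs out of exp/chain/tdnn_${letter}_sp/decode_*).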
+ +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_z # Note: _sp will get added to this if $speed_perturb == true. 
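+# Rough sketch of the edge handling described in the _p note above, assuming
+# the weights apply per 150-frame chunk as set further down (frames_per_eg=150,
+# --frames-overlap-per-eg 30):
+#   frames   1-10  : deriv weight 0.0
+#   frames  11-20  : ramps 0.0 -> 1.0
+#   frames  21-130 : 1.0
+#   frames 131-140 : ramps 1.0 -> 0.0
+#   frames 141-150 : 0.0
+# so only the ~110 interior frames contribute fully to the objective.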
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/show_wer.sh b/egs/swbd/s5c/local/chain/show_wer.sh new file mode 100755 index 00000000000..a82c4acf26d --- /dev/null +++ b/egs/swbd/s5c/local/chain/show_wer.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for l in $*; do + grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh +done +for l in $*; do + grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh +done +for l in $*; do + grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh +done +for l in $*; do + grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh +done diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v1.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v1.sh new file mode 100755 index 00000000000..8e4ef2935a3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v1.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# this is based oni dan's tdnn_2o script +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v1 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
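+  # The create_split_dir.pl call above is CLSP-specific: as far as I can tell
+  # it distributes the (large) egs storage over the /export/b0{5,6,7,8}
+  # filesystems via symlinks under $dir/egs/storage; on other clusters the
+  # block is simply skipped and the egs are written directly under $dir/egs.
+  # The .nodelete marker is meant to stop automatic egs cleanup from removing them.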
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v2.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v2.sh new file mode 100755 index 00000000000..f5718837690 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v2.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# this is same as v1 script but with l2 regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v2 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v3.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v3.sh new file mode 100755 index 00000000000..3b280712aeb --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v3.sh @@ -0,0 +1,173 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v3 # Note: _sp will get added to this if $speed_perturb == true. 
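+# A note on the xent-regularization this version adds (my reading of the
+# --xent-regularize 0.2 option passed to training below): a second,
+# cross-entropy output layer is trained alongside the chain output and its
+# objective is added to the LF-MMI objective with weight 0.2, roughly
+#   objf = LF-MMI + 0.2 * CE
+# acting as a regularizer; the xent output is not used at decode time.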
+decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v4.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v4.sh new file mode 100755 index 00000000000..c10e296dee9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v4.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v4 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +xent_regularize=0.2 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "$splice_indexes" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v5.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v5.sh new file mode 100755 index 00000000000..262e241296f --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v5.sh @@ -0,0 +1,205 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v5 # Note: _sp will get added to this if $speed_perturb == true. 
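+# Relative to run_tdnn_v4.sh the main change visible in this file appears to
+# be --chain.leaky-hmm-coefficient (0.00001 here vs 0.1 there).  As I
+# understand the option, it lets the denominator HMM leak a little probability
+# mass into every state on each frame; a value this small essentially
+# disables that.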
+decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +xent_regularize=0.2 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "$splice_indexes" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
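+  # For reference, train.py takes the same knobs as the train_tdnn.sh calls
+  # earlier in this patch, just grouped into namespaces; from comparing the
+  # two call sites:
+  #   --lm-opts "..."       -> --chain.lm-opts="..."
+  #   --xent-regularize X   -> --chain.xent-regularize X
+  #   --max-param-change X  -> --trainer.max-param-change X
+  #   --frames-per-eg N     -> --egs.chunk-width N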
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.00001 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v6.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v6.sh new file mode 100755 index 00000000000..866b5064757 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v6.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v6 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=768 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7.sh new file mode 100755 index 00000000000..ede618e0639 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v7 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_2y_sp/egs +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7_pool.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7_pool.sh new file mode 100755 index 00000000000..8aa54c556a4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7_pool.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v7 # Note: _sp will get added to this if $speed_perturb == true. 
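+# Careful: this is the same experiment directory as run_tdnn_v7.sh above, so
+# running both as-is would overwrite that run; a distinct name (for instance
+# exp/chain/tdnn_v7_pool, just a suggestion) would keep the two separate.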
+decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-2,-1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
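+  # Rough context arithmetic for the splicing used in the configs above
+  # ("-2,-1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0"): summing the most
+  # negative / most positive offset at each layer gives about 20 frames of
+  # left context and 14 of right context per output frame, assuming the
+  # config generator composes the offsets additively.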
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v8_pool.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v8_pool.sh new file mode 100755 index 00000000000..e217fba7af5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v8_pool.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# same as v7 but with large dimension +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v8 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=768 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-2,-1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/confidence_calibration.sh b/egs/swbd/s5c/local/confidence_calibration.sh new file mode 100755 index 00000000000..de330866622 --- /dev/null +++ b/egs/swbd/s5c/local/confidence_calibration.sh @@ -0,0 +1,84 @@ +#!/bin/bash +. cmd.sh +. path.sh + +# Global options, +graph=exp/tri4/graph_sw1_tg +arpa_gz=data/local/lm/sw1_fsh.o3g.kn.gz +lmwt=14 + +# Dev-set options, +dev_data=data/train_dev +dev_latdir=exp/tri4/decode_dev_sw1_tg + +# Eval-set options, +eval_data=data/eval2000 +eval_latdir=exp/tri4/decode_eval2000_sw1_tg + +. utils/parse_options.sh +set -euxo pipefail + +# Derived options, +dev_caldir=$dev_latdir/confidence_$lmwt +eval_caldir=$eval_latdir/confidence_$lmwt + +###### Data preparation, + +# Prepare filtering for excluding data from train-set (1 .. keep word, 0 .. exclude word), +# - only excludes from training-targets, the confidences are recalibrated for all the words, +word_filter=$(mktemp) +awk '{ keep_the_word = $1 !~ /^(\[.*\]|<.*>|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graph/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graph/phones/align_lexicon.txt \ + r=1 $graph/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graph/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt $lmwt \ + $dev_data $graph $word_feats $dev_latdir $dev_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $eval_data $graph $eval_latdir $dev_caldir $eval_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. +# Please compare with the default scoring script for your database. 
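+# The calibrated CTM keeps the usual five CTM fields (recording, channel,
+# start, duration, word) and the recalibrated confidence is the last column;
+# a quick way to inspect it is, for example:
+#   head -n 3 $eval_caldir/ctm_calibrated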
+ +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$eval_caldir/ctm_calibrated +stm=$eval_data/stm +glm=$eval_data/glm + +# Normalizng CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \ + grep -i -v -E '' | \ + grep -i -v -E ' (UH|UM|EH|MM|HM|AH|HUH|HA|ER|OOF|HEE|ACH|EEE|EW) ' | \ + awk '$5 !~ /^.*-$/' | \ + local/map_acronyms_ctm.py -M data/local/dict_nosp/acronyms.map -i - -o ${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $eval_data/segments $eval_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv diff --git a/egs/swbd/s5c/local/map_acronyms_ctm.py b/egs/swbd/s5c/local/map_acronyms_ctm.py index c7f002cb2c7..983c02205d9 100755 --- a/egs/swbd/s5c/local/map_acronyms_ctm.py +++ b/egs/swbd/s5c/local/map_acronyms_ctm.py @@ -15,6 +15,9 @@ parser.add_argument('-M','--Map',help='Input acronyms map', required=True) args = parser.parse_args() +if args.input == '-': args.input = '/dev/stdin' +if args.output == '-': args.output = '/dev/stdout' + dict_acronym_back = {} fin_map = open(args.Map, "r") for line in fin_map: diff --git a/egs/swbd/s5c/local/nnet/run_dnn.sh b/egs/swbd/s5c/local/nnet/run_dnn.sh index d0bc50d6ea7..0ad87100e31 100755 --- a/egs/swbd/s5c/local/nnet/run_dnn.sh +++ b/egs/swbd/s5c/local/nnet/run_dnn.sh @@ -30,28 +30,29 @@ has_fisher=true . utils/parse_options.sh || exit 1; # +set -euxo pipefail + if [ $stage -le 0 ]; then # Store fMLLR features, so we can train on them easily, # eval2000 dir=$data_fmllr/eval2000 steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ --transform-dir $gmmdir/decode_eval2000_sw1_tg \ - $dir data/eval2000 $gmmdir $dir/log $dir/data || exit 1 + $dir data/eval2000 $gmmdir $dir/log $dir/data # train dir=$data_fmllr/train_nodup steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ --transform-dir ${gmmdir}_ali_nodup \ - $dir data/train_nodup $gmmdir $dir/log $dir/data || exit 1 + $dir data/train_nodup $gmmdir $dir/log $dir/data # split the data : 90% train 10% cross-validation (held-out) - utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1 + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 fi if [ $stage -le 1 ]; then # Pre-train DBN, i.e. a stack of RBMs dir=exp/dnn5b_pretrain-dbn - (tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log $cuda_cmd $dir/log/pretrain_dbn.log \ - steps/nnet/pretrain_dbn.sh --rbm-iter 1 $data_fmllr/train_nodup $dir || exit 1; + steps/nnet/pretrain_dbn.sh --rbm-iter 1 $data_fmllr/train_nodup $dir fi if [ $stage -le 2 ]; then @@ -60,16 +61,15 @@ if [ $stage -le 2 ]; then ali=${gmmdir}_ali_nodup feature_transform=exp/dnn5b_pretrain-dbn/final.feature_transform dbn=exp/dnn5b_pretrain-dbn/6.dbn - (tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log # Train $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ - $data_fmllr/train_nodup_tr90 $data_fmllr/train_nodup_cv10 data/lang $ali $ali $dir || exit 1; + $data_fmllr/train_nodup_tr90 $data_fmllr/train_nodup_cv10 data/lang $ali $ali $dir # Decode with the trigram swbd language model. 
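  # (the acwt 0.08333 used below is 1/12, i.e. the usual LM scale of 12
  # expressed as an acoustic scale for the DNN decode)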
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" \ --config conf/decode_dnn.config --acwt 0.08333 \ $gmmdir/graph_sw1_tg $data_fmllr/eval2000 \ - $dir/decode_eval2000_sw1_tg || exit 1; + $dir/decode_eval2000_sw1_tg if $has_fisher; then # Rescore with the 4gram swbd+fisher language model. steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -80,8 +80,7 @@ fi # Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. We use usually good acwt 0.1 -# Lattices are re-generated after 1st epoch, to get faster convergence. +# with per-utterance updates. The typical acwt value is around 0.1 dir=exp/dnn5b_pretrain-dbn_dnn_smbr srcdir=exp/dnn5b_pretrain-dbn_dnn acwt=0.0909 @@ -89,62 +88,28 @@ acwt=0.0909 if [ $stage -le 3 ]; then # First we generate lattices and alignments: steps/nnet/align.sh --nj 250 --cmd "$train_cmd" \ - $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali || exit 1; + $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali steps/nnet/make_denlats.sh --nj 10 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --acwt $acwt $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_denlats || exit 1; + --acwt $acwt $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_denlats fi if [ $stage -le 4 ]; then # Re-train the DNN by 1 iteration of sMBR - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ - $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 - # Decode (reuse HCLG graph) - for ITER in 1; do - # Decode with the trigram swbd language model. - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" \ - --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - $gmmdir/graph_sw1_tg $data_fmllr/eval2000 \ - $dir/decode_eval2000_sw1_tg || exit 1; - if $has_fisher; then - # Rescore with the 4gram swbd+fisher language model. - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - data/lang_sw1_{tg,fsh_fg} data/eval2000 \ - $dir/decode_eval2000_sw1_{tg,fsh_fg} - fi - done -fi - -# Re-generate lattices, run 4 more sMBR iterations -dir=exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats -srcdir=exp/dnn5b_pretrain-dbn_dnn_smbr -acwt=0.0909 - -if [ $stage -le 5 ]; then - # First we generate lattices and alignments: - steps/nnet/align.sh --nj 250 --cmd "$train_cmd" \ - $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali || exit 1; - steps/nnet/make_denlats.sh --nj 10 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --acwt $acwt $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_denlats || exit 1; -fi - -if [ $stage -le 6 ]; then - # Re-train the DNN by 1 iteration of sMBR - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 2 --acwt $acwt --do-smbr true \ - $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir # Decode (reuse HCLG graph) - for ITER in 1 2; do + for ITER in 4 3 2 1; do # Decode with the trigram swbd language model. steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" \ --config conf/decode_dnn.config \ --nnet $dir/${ITER}.nnet --acwt $acwt \ $gmmdir/graph_sw1_tg $data_fmllr/eval2000 \ - $dir/decode_eval2000_sw1_tg || exit 1; + $dir/decode_eval2000_sw1_tg_it$ITER if $has_fisher; then # Rescore with the 4gram swbd+fisher language model. 
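      # (lmrescore_const_arpa.sh only replaces the LM scores on the existing
      # lattices with the 4-gram const-arpa LM, so no re-decoding is needed)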
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_sw1_{tg,fsh_fg} data/eval2000 \ - $dir/decode_eval2000_sw1_{tg,fsh_fg} + $dir/decode_eval2000_sw1_{tg,fsh_fg}_it$ITER fi done fi diff --git a/egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh b/egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh index f15add1f3f5..4cd6a21873f 100755 --- a/egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh +++ b/egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh @@ -12,221 +12,186 @@ # Config: stage=0 # resume training with --stage=N +has_fisher=true # End of config. . utils/parse_options.sh || exit 1; # +set -euxo pipefail + +train_src=data/train_nodup +train=data-fbank-pitch/train_nodup + +dev_src=data/eval2000 +dev=data-fbank-pitch/eval2000 + +gmmdir=exp/tri4 + +lang=data/lang +lang_test=data/lang_sw1_tg + if [ $stage -le 1 ]; then - # prepare the FBANK+f0 features - # eval2000 - dir=data-fbank-pitch/eval2000; srcdir=data/eval2000 - (mkdir -p $dir; cp $srcdir/* $dir; ) - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 $dir $dir/log $dir/data || exit 1; - steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; - - # training set - dir=data-fbank-pitch/train; srcdir=data/train - (mkdir -p $dir; cp $srcdir/* $dir; ) - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 $dir $dir/log $dir/data || exit 1; - steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; + [ -e $dev ] && echo "Existing '$dev', better quit than overwrite!!!" && exit 1 + # prepare the FBANK+f0 features, + # eval2000, + utils/copy_data_dir.sh $dev_src $dev; rm $dev/{feats,cmvn}.scp + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 $dev $dev/log $dev/data + steps/compute_cmvn_stats.sh $dev $dev/log $dev/data + # training set, + utils/copy_data_dir.sh $train_src $train; rm $train/{feats,cmvn}.scp + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 $train $train/log $train/data + steps/compute_cmvn_stats.sh $train $train/log $train/data fi if [ $stage -le 2 ]; then - # Prepare same subsets as in the main MFCC-GMM recipe, these will be used - # during during building GMM system from flat-start, later in the Tandem recipe. - data=data-fbank-pitch - - # Use the first 4k sentences as dev set. Note: when we trained the LM, we used - utils/subset_data_dir.sh --first $data/train 4000 $data/train_dev # 5hr 6min - n=$[`cat data/train/segments | wc -l` - 4000] - utils/subset_data_dir.sh --last $data/train $n $data/train_nodev - - # Prepare data for training mono - utils/subset_data_dir.sh --shortest $data/train_nodev 100000 $data/train_100kshort - utils/subset_data_dir.sh $data/train_100kshort 10000 $data/train_10k - local/remove_dup_utts.sh 100 $data/train_10k $data/train_10k_nodup - - # Take the first 30k utterances (about 1/8th of the data) - utils/subset_data_dir.sh --first $data/train_nodev 30000 $data/train_30k - local/remove_dup_utts.sh 200 $data/train_30k $data/train_30k_nodup - - # Take the first 100k utterances (just under half the data); we'll use - # this for later stages of training. 
- utils/subset_data_dir.sh --first $data/train_nodev 100000 $data/train_100k - local/remove_dup_utts.sh 200 $data/train_100k $data/train_100k_nodup - - # Full training dataset, - local/remove_dup_utts.sh 300 $data/train_nodev $data/train_nodup - # split the data : 90% train 10% cross-validation (held-out) - dir=$data/train_nodup - utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1 + # split the data : 90% train, 10% cross-validation (held-out set), + utils/subset_data_dir_tr_cv.sh $train ${train}_tr90 ${train}_cv10 fi ######################################################################################### # Let's build universal-context bottleneck network # - Universal context MLP is a hierarchy of two bottleneck neural networks -# - The first network can see a limited range of frames (11 frames) -# - The second network sees concatenation of bottlneck outputs of the first -# network, with temporal shifts -10 -5 0 5 10, (in total a range of 31 frames +# - The first network has limited range of frames on input (11 frames) +# - The second network input is a concatenation of bottlneck outputs from the first +# network, with temporal shifts -10 -5..5 10, (in total a range of 31 frames # in the original feature space) -# - This structure has been reported to produce superior performance -# compared to a network with single bottleneck +# - This structure produces superior performance w.r.t. single bottleneck network # if [ $stage -le 3 ]; then - # 1st network, overall context +/-5 frames - # - the topology is 90_1500_1500_80_1500_NSTATES, linear bottleneck - dir=exp/nnet5b_uc-part1 - ali=exp/tri4_ali_nodup + # Train 1st network, overall context +/-5 frames + # - the topology is 90_1500_1500_80_1500_NSTATES, linear bottleneck, + dir=exp/nnet5uc-part1 + ali=${gmmdir}_ali_nodup $cuda_cmd $dir/log/train_nnet.log \ - steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 80 --apply-cmvn true \ - --copy-feats false \ + steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 80 \ + --cmvn-opts "--norm-means=true --norm-vars=false" \ --feat-type traps --splice 5 --traps-dct-basis 6 --learn-rate 0.008 \ - data-fbank-pitch/train_nodup_tr90 data-fbank-pitch/train_nodup_cv10 data/lang ${ali} ${ali} $dir || exit 1; + ${train}_tr90 ${train}_cv10 $lang $ali $ali $dir fi +# if [ $stage -le 4 ]; then # Compose feature_transform for the next stage, - # - remaining part of the first network is fixed - dir=exp/nnet5b_uc-part1 + # - remaining part of the first network is fixed, + dir=exp/nnet5uc-part1 feature_transform=$dir/final.feature_transform.part1 - nnet-concat $dir/final.feature_transform \ - "nnet-copy --remove-last-layers=4 --binary=false $dir/final.nnet - |" \ - "utils/nnet/gen_splice.py --fea-dim=80 --splice=2 --splice-step=5 |" \ - $feature_transform || exit 1 + # Create splice transform, + nnet-initialize <(echo " 80 1040 -10 -5:5 10 ") \ + $dir/splice_for_bottleneck.nnet + # Concatanate the input-transform, 1stage network, splicing, + nnet-concat $dir/final.feature_transform "nnet-copy --remove-last-components=4 $dir/final.nnet - |" \ + $dir/splice_for_bottleneck.nnet $feature_transform - # 2nd network, overall context +/-15 frames - # - the topology will be 400_1500_1500_30_1500_NSTATES, again, the bottleneck is linear - dir=exp/nnet5b_uc-part2 - ali=exp/tri4_ali_nodup + # Train 2nd network, overall context +/-15 frames, + # - the topology will be 1040_1500_1500_30_1500_NSTATES, linear bottleneck, + # - cmvn_opts get imported inside 'train.sh', + dir=exp/nnet5uc-part2 + 
ali=${gmmdir}_ali_nodup $cuda_cmd $dir/log/train_nnet.log \ - steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 30 --apply-cmvn true \ + steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 30 \ --feature-transform $feature_transform --learn-rate 0.008 \ - data-fbank-pitch/train_nodup_tr90 data-fbank-pitch/train_nodup_cv10 data/lang ${ali} ${ali} $dir || exit 1; + ${train}_tr90 ${train}_cv10 $lang $ali $ali $dir fi # ######################################################################################### +# Decode the 2nd DNN, if [ $stage -le 5 ]; then - # Store the BN-features - data=data-bn/nnet5b_uc-part2 - srcdata=data-fbank-pitch/ - nnet=exp/nnet5b_uc-part2 - # eval2000 - steps/nnet/make_bn_feats.sh --cmd "$train_cmd" --nj 20 $data/eval2000 $srcdata/eval2000 \ - $nnet $data/eval2000/log $data/eval2000/data || exit 1 - # trainig data (full set) - steps/nnet/make_bn_feats.sh --cmd "$train_cmd" --nj 40 $data/train $srcdata/train \ - $nnet $data/train/log $data/train/data || exit 1 - - # Compute CMVN of the BN-features - dir=data-bn/nnet5b_uc-part2/train - steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; - dir=data-bn/nnet5b_uc-part2/eval2000 - steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; + dir=exp/nnet5uc-part2 + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.08333 \ + $gmmdir/graph_sw1_tg $dev $dir/decode_eval2000_sw1_tg fi +# Store the BN-features, +nnet=exp/nnet5uc-part2 +train_bn=data-$(basename $nnet)/train_nodup +dev_bn=data-$(basename $nnet)/eval2000 if [ $stage -le 6 ]; then - # Prepare BN-feature subsets same as with MFCCs in run.sh - data=data-bn/nnet5b_uc-part2/ - - # Use the first 4k sentences as dev set. - utils/subset_data_dir.sh --first $data/train 4000 $data/train_dev # 5hr 6min - n=$[`cat data/train/segments | wc -l` - 4000] - utils/subset_data_dir.sh --last $data/train $n $data/train_nodev - - # Prepare data for training mono - utils/subset_data_dir.sh --shortest $data/train_nodev 100000 $data/train_100kshort - utils/subset_data_dir.sh $data/train_100kshort 10000 $data/train_10k - local/remove_dup_utts.sh 100 $data/train_10k $data/train_10k_nodup - - # Take the first 30k utterances (about 1/8th of the data) - utils/subset_data_dir.sh --first $data/train_nodev 30000 $data/train_30k - local/remove_dup_utts.sh 200 $data/train_30k $data/train_30k_nodup - - # Take the first 100k utterances (just under half the data); we'll use - # this for later stages of training. - utils/subset_data_dir.sh --first $data/train_nodev 100000 $data/train_100k - local/remove_dup_utts.sh 200 $data/train_100k $data/train_100k_nodup - - # Full dataset - local/remove_dup_utts.sh 300 $data/train_nodev $data/train_nodup + # eval2000, + steps/nnet/make_bn_feats.sh --cmd "$train_cmd" --nj 20 $dev_bn $dev $nnet $dev_bn/log $dev_bn/data + # trainig, + steps/nnet/make_bn_feats.sh --cmd "$train_cmd --max-jobs-run 50" --nj 200 $train_bn $train $nnet $train_bn/log $train_bn/data + # For further GMM training, we have to produce cmvn statistics even if not used!!! 
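+  # (the GMM training and decoding scripts expect cmvn.scp to be present in
+  # the data directory, even when the cmvn options later make it a no-op)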
+ steps/compute_cmvn_stats.sh $dev_bn $dev_bn/log $dev_bn/data + steps/compute_cmvn_stats.sh $train_bn $train_bn/log $train_bn/data fi - -# Start building the tandem GMM system -# - train from mono to tri4b, run bmmi training -bndata=data-bn/nnet5b_uc-part2/ - +# Use single-pass retraining to build new GMM system on top of bottleneck features, if [ $stage -le 7 ]; then - steps/tandem/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k_nodup $bndata/train_10k_nodup data/lang exp/tandem2uc-mono0a || exit 1; - - steps/tandem/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k_nodup $bndata/train_30k_nodup data/lang exp/tandem2uc-mono0a exp/tandem2uc-mono0a_ali || exit 1; - - steps/tandem/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_30k_nodup $bndata/train_30k_nodup data/lang exp/tandem2uc-mono0a_ali exp/tandem2uc-tri1 || exit 1; - - utils/mkgraph.sh data/lang_test exp/tandem2uc-tri1 exp/tandem2uc-tri1/graph - - steps/tandem/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - exp/tandem2uc-tri1/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri1/decode_eval2000 + dir=exp/tri6uc + ali_src=${gmmdir}_ali_nodup + graph=$dir/graph_${lang_test#*lang_} + # Train, + # GMM on bn features, no cmvn, no lda-mllt, + steps/train_deltas.sh --cmd "$train_cmd" --delta-opts "--delta-order=0" \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --beam 20 --retry-beam 80 \ + 11500 200000 $train_bn $lang $ali_src $dir + # Decode, + utils/mkgraph.sh $lang_test $dir $graph + steps/decode.sh --nj 30 --cmd "$decode_cmd" --acwt 0.05 --beam 15.0 --lattice-beam 8.0 \ + $graph $dev_bn $dir/decode_$(basename $dev_bn)_$(basename $graph) + # Align, + steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \ + --beam 20 --retry-beam 80 \ + $train_bn $lang $dir ${dir}_ali fi +# Train SAT-adapted GMM on bottleneck features, if [ $stage -le 8 ]; then - steps/tandem/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k_nodup $bndata/train_30k_nodup data/lang exp/tandem2uc-tri1 exp/tandem2uc-tri1_ali || exit 1; - - steps/tandem/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_30k_nodup $bndata/train_30k_nodup data/lang exp/tandem2uc-tri1_ali exp/tandem2uc-tri2 || exit 1; - - utils/mkgraph.sh data/lang_test exp/tandem2uc-tri2 exp/tandem2uc-tri2/graph || exit 1; - steps/tandem/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - exp/tandem2uc-tri2/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri2/decode_eval2000 || exit 1; + dir=exp/tri7uc-sat + ali=exp/tri6uc_ali + graph=$dir/graph_${lang_test#*lang_} + # Train, + # fmllr-gmm system on bottleneck features, + # - no cmvn, put fmllr to the features directly (no lda), + # - note1 : we don't need cmvn, similar effect has diagonal of fmllr transform, + # - note2 : lda+mllt was causing a small hit <0.5%, + steps/train_sat.sh --cmd "$train_cmd" --beam 20 --retry-beam 80 \ + 11500 200000 $train_bn $lang $ali $dir + # Decode, + utils/mkgraph.sh $lang_test $dir $graph + steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --acwt 0.05 --beam 15.0 --lattice-beam 8.0 \ + $graph $dev_bn $dir/decode_$(basename $dev_bn)_$(basename $graph) fi +# Prepare alignments and lattices for bMMI training, if [ $stage -le 9 ]; then - steps/tandem/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k_nodup $bndata/train_100k_nodup data/lang exp/tandem2uc-tri2 exp/tandem2uc-tri2_ali || exit 1; - - # Train tri3b, which is LDA+MLLT, on 100k_nodup data. 
- steps/tandem/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" \ - 5500 90000 data/train_100k_nodup $bndata/train_100k_nodup data/lang exp/tandem2uc-tri2_ali exp/tandem2uc-tri3b || exit 1; - - utils/mkgraph.sh data/lang_test exp/tandem2uc-tri3b exp/tandem2uc-tri3b/graph || exit 1; - steps/tandem/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - exp/tandem2uc-tri3b/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri3b/decode_eval2000 || exit 1; + dir=exp/tri7uc-sat + # Align, + steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" --beam 20 --retry-beam 80 \ + $train_bn $lang $dir ${dir}_ali_nodup + # Make denlats, + steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" --acwt 0.05 \ + --config conf/decode.config --transform-dir ${dir}_ali_nodup \ + $train_bn $lang $dir ${dir}_denlats_nodup fi +# 4 iterations of bMMI seems to work well overall. The number of iterations is +# used as an explicit argument even though train_mmi.sh will use 4 iterations by +# default. +num_mmi_iters=4 if [ $stage -le 10 ]; then - # From now, we start building a more serious system (with SAT), - # and we'll do the alignment with fMLLR. - steps/tandem/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri3b exp/tandem2uc-tri3b_ali_nodup || exit 1; - - steps/tandem/train_sat.sh --cmd "$train_cmd" \ - 11500 200000 data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri3b_ali_nodup exp/tandem2uc-tri4b || exit 1; - - utils/mkgraph.sh data/lang_test exp/tandem2uc-tri4b exp/tandem2uc-tri4b/graph || exit 1 - steps/tandem/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - exp/tandem2uc-tri4b/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri4b/decode_eval2000 || exit 1 + dir=exp/tri7uc-sat_mmi_b0.1 + graph=exp/tri7uc-sat/graph_${lang_test#*lang_} + steps/train_mmi.sh --cmd "$decode_cmd" \ + --boost 0.1 --num-iters $num_mmi_iters \ + $train_bn $lang exp/tri7uc-sat_{ali,denlats}_nodup ${dir} + for iter in 1 2 3 4; do + steps/decode.sh --nj 30 --cmd "$decode_cmd" --acwt 0.05 \ + --config conf/decode.config --iter $iter \ + --transform-dir exp/tri7uc-sat/decode_$(basename $dev_bn)_$(basename $graph) \ + $graph $dev_bn $dir/decode_$(basename $dev_bn)_$(basename $graph)_it${iter} + done fi -# bMMI starting from system in tandem2uc-tri4b, use full dataset. if [ $stage -le 11 ]; then - steps/tandem/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4b exp/tandem2uc-tri4b_ali || exit 1; - steps/tandem/make_denlats.sh --nj 40 --cmd "$decode_cmd" --transform-dir exp/tandem2uc-tri4b_ali \ - --sub-split 100 data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4b exp/tandem2uc-tri4b_denlats || exit 1; -fi -if [ $stage -le 12 ]; then - steps/tandem/train_mmi.sh --cmd "$decode_cmd" --boost 0.1 --acwt 0.039 \ - data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4b_{ali,denlats} exp/tandem2uc-tri4b_mmi_b0.1 || exit 1; - - steps/tandem/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - --transform-dir exp/tandem2uc-tri4b/decode_eval2000 \ - exp/tandem2uc-tri4b/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri4b_mmi_b0.1/decode_eval2000 || exit 1; + if $has_fisher; then + # Rescore with the 4gram swbd+fisher language model. 
+ dir=exp/tri7uc-sat_mmi_b0.1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/eval2000 \ + $dir/decode_eval2000_graph_sw1_{tg,fsh_fg}_it4 + fi fi -echo success -exit 0 +echo Done. diff --git a/egs/swbd/s5c/local/nnet2/run_nnet2.sh b/egs/swbd/s5c/local/nnet2/run_nnet2.sh index 0872560337b..e83c587a006 100755 --- a/egs/swbd/s5c/local/nnet2/run_nnet2.sh +++ b/egs/swbd/s5c/local/nnet2/run_nnet2.sh @@ -5,7 +5,7 @@ # units, on top of fMLLR features, on GPU. temp_dir= -dir=exp/nnet2_5 +dir=nnet2_5 has_fisher=true . ./cmd.sh @@ -18,10 +18,10 @@ parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll ( if [ ! -f exp/$dir/final.mdl ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d exp/$dir/egs/storage ]; then # spread the egs over various machines. utils/create_split_dir.pl \ - /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/exp/$dir/egs/storage exp/$dir/egs/storage fi steps/nnet2/train_pnorm_accel2.sh --parallel-opts "$parallel_opts" \ diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh new file mode 100755 index 00000000000..32494afe47b --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +set -o pipefail +set -e +# this is run_discriminative.sh + +# This script does discriminative training on top of CE BLSTM system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. +# +. cmd.sh + + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=-10 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). + +frames_per_chunk=150 +# The contexts here must match the one used for training +extra_left_context=40 +extra_right_context=40 +extra_left_context_initial=-1 +extra_right_context_final=-1 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +srcdir=exp/nnet3/lstm_bidirectional_ld0 +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +degs_dir= # If provided, will skip the degs directory creation +lats_dir= # If provided, will skip denlats creation + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion} + +## Egs options +frames_per_eg=150 +frames_overlap_per_eg=30 +truncate_deriv_weights=10 + +## Nnet training options +effective_learning_rate=0.0000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=4 +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +minibatch_size=64 +adjust_priors=true # May need to be set to false + # because it does not help in some setups +modify_learning_rates=true +last_layer_factor=0.1 + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! 
cuda-compiled; then + cat < $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in eval2000 train_dev rt03; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done + + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +fi + +# ivector extractor training +if [ $stage -le 5 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/train_100k_nodup_hires \ + data/lang_nosp exp/tri2_ali_100k_nodup exp/nnet3/tri3b +fi + +if [ $stage -le 6 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + # We extract iVectors on all the train_nodup data, which will be what we + # train the system on. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). 
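+  # (--utts-per-spk-max 2 splits each speaker into pseudo-speakers of at most
+  # two utterances, so the training iVectors are estimated from little data,
+  # similar to what happens at the start of per-utterance decoding)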
+ steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; + + for data_set in eval2000 train_dev rt03; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1; + done +fi + +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/run_lstm.sh b/egs/swbd/s5c/local/nnet3/run_lstm.sh new file mode 100755 index 00000000000..11fc851cb71 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_lstm.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# Copyright 2015 Johns Hopkins University (Author: Daniel Povey). +# 2015 Vijayaditya Peddinti +# 2015 Xingyu Na +# 2015 Pegah Ghahrmani +# Apache 2.0. + + +# this is a basic lstm script +# LSTM script runs for more epochs than the TDNN script +# and each epoch takes twice the time + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false + +stage=0 +train_stage=-10 +has_fisher=true +affix= +speed_perturb=true +common_egs_dir= +reporting_email= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 + + +# training options +num_epochs=8 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=15 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <" + echo "See comments in the script for more details" + exit 1 +fi + +sdir=$1 +[ ! -d $sdir/data/audio/eval03/english/cts ] \ + && echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1; +[ ! -d $sdir/data/references/eval03/english/cts ] \ + && echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1; + +. path.sh + +dir=data/local/rt03 +mkdir -p $dir + +rtroot=$sdir +tdir=$sdir/data/references/eval03/english/cts +sdir=$sdir/data/audio/eval03/english/cts + +find $sdir -iname '*.sph' | sort > $dir/sph.flist +sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ + > $dir/sph.scp + +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +[ ! -x $sph2pipe ] \ + && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; + +awk -v sph2pipe=$sph2pipe '{ + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); +}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; +#side A - channel 1, side B - channel 2 + +# Get segments file... +# segments file format is: utt-id side-id start-time end-time, e.g.: +# sw02001-A_000098-001156 sw02001-A 0.98 11.56 +#pem=$sdir/english/hub5e_00.pem +#[ ! 
-f $pem ] && echo "No such file $pem" && exit 1; +# pem file has lines like: +# en_4156 A unknown_speaker 301.85 302.48 + +#grep -v ';;' $pem \ +cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ + | awk '{ + spk=$1"-"(($2==1)?"A":"B"); + utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); + print utt,spk,$4,$5;}' \ + | sort -u > $dir/segments + +# stm file has lines like: +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# TODO(arnab): We should really be lowercasing this since the Edinburgh +# recipe uses lowercase. This is not used in the actual scoring. +#grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ +cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ + | awk '{ + spk=$1"-"(($2==1)?"A":"B"); + utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); + printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \ + | sort > $dir/text.all + +# We'll use the stm file for sclite scoring. There seem to be various errors +# in the stm file that upset hubscr.pl, and we fix them here. +cat $tdir/*.stm | \ + sed -e 's:((:(:' -e 's:::g' -e 's:::g' | \ + grep -v inter_segment_gap | \ + awk '{ + printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ + > $dir/stm +#$tdir/reference/hub5e00.english.000405.stm > $dir/stm +cp $rtroot/data/trans_rules/en20030506.glm $dir/glm + +# next line uses command substitution +# Just checking that the segments are the same in pem vs. stm. +! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \ + echo "Segments from pem file and stm file do not match." && exit 1; + +grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text + +# create an utt2spk file that assumes each conversation side is +# a separate speaker. +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +# cp $dir/segments $dir/segments.tmp +# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \ +# $dir/segments.tmp > $dir/segments + +awk '{print $1}' $dir/wav.scp \ + | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; + print "$1-$2 $1 $2\n"; ' \ + > $dir/reco2file_and_channel || exit 1; + +dest=data/rt03 +mkdir -p $dest +for x in wav.scp segments text utt2spk spk2utt stm glm reco2file_and_channel; do + cp $dir/$x $dest/$x +done + +echo Data preparation and formatting completed for RT-03 +echo "(but not MFCC extraction)" + diff --git a/egs/swbd/s5c/local/score.sh b/egs/swbd/s5c/local/score.sh index 81455d1e13a..40a49d0b41a 100755 --- a/egs/swbd/s5c/local/score.sh +++ b/egs/swbd/s5c/local/score.sh @@ -13,6 +13,7 @@ stage=0 min_lmwt=5 max_lmwt=20 reverse=false +iter=final word_ins_penalty=0.0,0.5,1.0 #end configuration section. diff --git a/egs/swbd/s5c/local/score_basic.sh b/egs/swbd/s5c/local/score_basic.sh index aaaf005ceba..8fed1b3bab7 100755 --- a/egs/swbd/s5c/local/score_basic.sh +++ b/egs/swbd/s5c/local/score_basic.sh @@ -6,6 +6,7 @@ cmd=run.pl min_lmwt=5 max_lmwt=20 reverse=false +iter=final word_ins_penalty=0.0,0.5,1.0 #end configuration section. @@ -26,9 +27,9 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. -hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! 
-f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; hubdir=`dirname $hubscr` @@ -42,10 +43,10 @@ mkdir -p $dir/scoring/log function filter_text { - perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } + perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } while() { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \ - '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' + '[noise]' '[laughter]' '[vocalized-noise]' '' '%hesitation' } for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do diff --git a/egs/swbd/s5c/local/score_sclite.sh b/egs/swbd/s5c/local/score_sclite.sh index 847e7625015..7ac33fdd26a 100755 --- a/egs/swbd/s5c/local/score_sclite.sh +++ b/egs/swbd/s5c/local/score_sclite.sh @@ -7,6 +7,7 @@ stage=0 min_lmwt=5 max_lmwt=20 reverse=false +iter=final word_ins_penalty=0.0,0.5,1.0 #end configuration section. @@ -28,9 +29,9 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. -hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; hubdir=`dirname $hubscr` @@ -43,32 +44,39 @@ name=`basename $data`; # e.g. eval2000 mkdir -p $dir/scoring/log +align_word= +reorder_opt= +if $reverse; then + align_word="lattice-reverse ark:- ark:- |" + reorder_opt="--reorder=false" +fi + + +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + +name=`basename $data`; # e.g. 
eval2000 + +mkdir -p $dir/scoring/log + if [ $stage -le 0 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - if $reverse; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ - mkdir -p $dir/score_LMWT_${wip}/ '&&' \ - lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-1best ark:- ark:- \| \ - lattice-reverse ark:- ark:- \| \ - lattice-align-words --reorder=false $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; - else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ - mkdir -p $dir/score_LMWT_${wip}/ '&&' \ - lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-1best ark:- ark:- \| \ - lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; - fi + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $reorder_opt $lang/phones/word_boundary.int $model ark:- ark:- \| \ + nbest-to-ctm $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; done fi @@ -93,7 +101,7 @@ if [ $stage -le 1 ]; then fi # Score the set... -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \ cp $data/stm $dir/score_LMWT_${wip}/ '&&' \ @@ -102,23 +110,45 @@ if [ $stage -le 2 ]; then fi # For eval2000 score the subsets -case "$name" in eval2000* ) - # Score only the, swbd part... - if [ $stage -le 3 ]; then +case "$name" in + eval2000*) + # Score only the, swbd part... + if [ $stage -le 3 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \ + grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ + grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1; + done + fi + # Score only the, callhome part... + if [ $stage -le 3 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \ + grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \ + grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.callhm $dir/score_LMWT_${wip}/${name}.ctm.callhm || exit 1; + done + fi + ;; +rt03* ) + + # Score only the swbd part... 
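+  # (rt03 recording ids start with 'fsh_' for the Fisher conversations and
+  # 'sw_' for the Switchboard ones, so removing '^fsh_' lines keeps the swbd
+  # subset and removing '^sw_' lines keeps the fisher subset)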
+ if [ $stage -le 3 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \ - grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ - grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ + grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ + grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1; done fi - # Score only the, callhome part... - if [ $stage -le 3 ]; then + # Score only the fisher part... + if [ $stage -le 3 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \ - grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \ - grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \ - $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.callhm $dir/score_LMWT_${wip}/${name}.ctm.callhm || exit 1; + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.fsh.LMWT.${wip}.log \ + grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \ + grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.fsh $dir/score_LMWT_${wip}/${name}.ctm.fsh || exit 1; done fi ;; diff --git a/egs/swbd/s5c/local/swbd1_data_download.sh b/egs/swbd/s5c/local/swbd1_data_download.sh index 00ec97c5028..d8f076b5141 100755 --- a/egs/swbd/s5c/local/swbd1_data_download.sh +++ b/egs/swbd/s5c/local/swbd1_data_download.sh @@ -10,18 +10,11 @@ ## you unpacked this. We are just doing a "find" command to locate ## the .sph files. -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). -## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - . path.sh #check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" exit 1; fi @@ -30,24 +23,19 @@ SWBD_DIR=$1 dir=data/local/train mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -[ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - # Trans directory check if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then ( cd $dir; if [ ! 
-d swb_ms98_transcriptions ]; then echo " *** Downloading trascriptions and dictionary ***" + wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz tar -xf switchboard_word_alignments.tar.gz fi diff --git a/egs/swbd/s5c/local/swbd1_data_prep.sh b/egs/swbd/s5c/local/swbd1_data_prep.sh index 57fb0ff56c8..9621e7fc06e 100755 --- a/egs/swbd/s5c/local/swbd1_data_prep.sh +++ b/egs/swbd/s5c/local/swbd1_data_prep.sh @@ -21,7 +21,7 @@ #check existing directories if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" exit 1; fi @@ -41,23 +41,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - ( - cd $dir; - if [ ! -d swb_ms98_transcriptions ]; then - echo " *** Downloading trascriptions and dictionary ***" - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - fi - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; diff --git a/egs/swbd/s5c/path.sh b/egs/swbd/s5c/path.sh index db666cc10f6..1bea0e69779 100755 --- a/egs/swbd/s5c/path.sh +++ b/egs/swbd/s5c/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH -#$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$KALDI_ROOT/tools/srilm/bin/i686:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + export LC_ALL=C diff --git a/egs/swbd/s5c/run.sh b/egs/swbd/s5c/run.sh index afe561f881f..3bc2df0a337 100755 --- a/egs/swbd/s5c/run.sh +++ b/egs/swbd/s5c/run.sh @@ -7,11 +7,13 @@ # 1. added more training data for early stages # 2. removed SAT system (and later stages) on the 100k utterance training data # 3. reduced number of LM rescoring, only sw1_tg and sw1_fsh_fg remain -# 4. mapped swbd transcription to fisher style, instead of the other way around +# 4. 
mapped swbd transcription to fisher style, instead of the other way around set -e # exit on error has_fisher=true local/swbd1_data_download.sh /export/corpora3/LDC/LDC97S62 +# local/swbd1_data_download.sh /mnt/matylda2/data/SWITCHBOARD_1R2 # BUT, + # prepare SWBD dictionary first since we want to find acronyms according to pronunciations # before mapping lexicon and transcripts local/swbd1_prepare_dict.sh @@ -20,7 +22,7 @@ local/swbd1_prepare_dict.sh # which specifies the directory to Switchboard documentations. Specifically, if # this argument is given, the script will look for the conv.tab file and correct # speaker IDs to the actual speaker personal identification numbers released in -# the documentations. The documentations can be found here: +# the documentations. The documentations can be found here: # https://catalog.ldc.upenn.edu/docs/LDC97S62/ # Note: if you are using this link, make sure you rename conv_tab.csv to conv.tab # after downloading. @@ -28,24 +30,22 @@ local/swbd1_prepare_dict.sh local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62 # local/swbd1_data_prep.sh /home/dpovey/data/LDC97S62 # local/swbd1_data_prep.sh /data/corpora0/LDC97S62 -# local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2 +# local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2 # BUT, # local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1 utils/prepare_lang.sh data/local/dict_nosp \ "" data/local/lang_nosp data/lang_nosp # Now train the language models. We are using SRILM and interpolating with an -# LM trained on the Fisher transcripts (part 2 disk is currently missing; so +# LM trained on the Fisher transcripts (part 2 disk is currently missing; so # only part 1 transcripts ~700hr are used) # If you have the Fisher data, you can set this "fisher_dir" variable. fisher_dirs="/export/corpora3/LDC/LDC2004T19/fe_03_p1_tran/ /export/corpora3/LDC/LDC2005T19/fe_03_p2_tran/" -#fisher_dirs="/home/dpovey/data/LDC2004T19/fe_03_p1_tran/" -#fisher_dirs="/data/corpora0/LDC2004T19/fe_03_p1_tran/" -# edinburgh: -# fisher_dirs="/exports/work/inf_hcrc_cstr_general/corpora/fisher/transcripts" -# brno: -# fisher_dirs="/mnt/matylda2/data/FISHER/fe_03_p1_tran" # BUT +# fisher_dirs="/home/dpovey/data/LDC2004T19/fe_03_p1_tran/" +# fisher_dirs="/data/corpora0/LDC2004T19/fe_03_p1_tran/" +# fisher_dirs="/exports/work/inf_hcrc_cstr_general/corpora/fisher/transcripts" # Edinburgh, +# fisher_dirs="/mnt/matylda2/data/FISHER/fe_03_p1_tran /mnt/matylda2/data/FISHER/fe_03_p2_tran" # BUT, local/swbd1_train_lms.sh data/local/train/text \ data/local/dict_nosp/lexicon.txt data/local/lm $fisher_dirs @@ -79,7 +79,7 @@ mfccdir=mfcc for x in train eval2000; do steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" \ data/$x exp/make_mfcc/$x $mfccdir - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir utils/fix_data_dir.sh data/$x done @@ -91,11 +91,10 @@ utils/subset_data_dir.sh --first data/train 4000 data/train_dev # 5hr 6min n=$[`cat data/train/segments | wc -l` - 4000] utils/subset_data_dir.sh --last data/train $n data/train_nodev -# Now-- there are 260k utterances (313hr 23min), and we want to start the -# monophone training on relatively short utterances (easier to align), but not -# only the shortest ones (mostly uh-huh). 
So take the 100k shortest ones; -# remove most of the repeated utterances (these are the uh-huh type ones), and -# then take 10k random utterances from those (about 4hr 40mins) +# Now-- there are 260k utterances (313hr 23min), and we want to start the +# monophone training on relatively short utterances (easier to align), but not +# only the shortest ones (mostly uh-huh). So take the 100k shortest ones, and +# then take 30k random utterances from those (about 12hr) utils/subset_data_dir.sh --shortest data/train_nodev 100000 data/train_100kshort utils/subset_data_dir.sh data/train_100kshort 30000 data/train_30kshort @@ -108,13 +107,13 @@ local/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup # 110hr local/remove_dup_utts.sh 300 data/train_nodev data/train_nodup # 286hr ## Starting basic training on MFCC features steps/train_mono.sh --nj 30 --cmd "$train_cmd" \ - data/train_30kshort data/lang_nosp exp/mono + data/train_30kshort data/lang_nosp exp/mono steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali + data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali steps/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1 + 3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1 ( graph_dir=exp/tri1/graph_nosp_sw1_tg @@ -125,7 +124,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \ ) & steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali + data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali steps/train_deltas.sh --cmd "$train_cmd" \ 4000 70000 data/train_100k_nodup data/lang_nosp exp/tri1_ali exp/tri2 @@ -149,11 +148,11 @@ steps/align_si.sh --nj 30 --cmd "$train_cmd" \ # From now, we start using all of the data (except some duplicates of common # utterances, which don't really contribute much). steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup + data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup # Do another iteration of LDA+MLLT training, on all the data. steps/train_lda_mllt.sh --cmd "$train_cmd" \ - 6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3 + 6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3 ( graph_dir=exp/tri3/graph_nosp_sw1_tg @@ -190,7 +189,7 @@ fi # Train tri4, which is LDA+MLLT+SAT, on all the (nodup) data. steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup + data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup steps/train_sat.sh --cmd "$train_cmd" \ @@ -203,6 +202,9 @@ steps/train_sat.sh --cmd "$train_cmd" \ steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" \ --config conf/decode.config \ $graph_dir data/eval2000 exp/tri4/decode_eval2000_sw1_tg + # Will be used for confidence calibration example, + steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" \ + $graph_dir data/train_dev exp/tri4/decode_dev_sw1_tg ) & wait @@ -212,13 +214,13 @@ if $has_fisher; then exp/tri4/decode_eval2000_sw1_{tg,fsh_fg} fi -# MMI training starting from the LDA+MLLT+SAT systems on all the (nodup) data. +# MMI training starting from the LDA+MLLT+SAT systems on all the (nodup) data. 
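# (A brief, hedged aside, not part of the original recipe comments: with boosted MMI, each
# denominator-lattice path s is additionally weighted by exp(-b * A(s, s_ref)), where A is the
# per-frame phone accuracy against the reference s_ref, so paths with more errors get relatively
# boosted in the denominator; the --boost 0.1 option passed to steps/train_mmi.sh below sets b.)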
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \ data/train_nodup data/lang exp/tri4 exp/tri4_ali_nodup steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" \ --config conf/decode.config --transform-dir exp/tri4_ali_nodup \ - data/train_nodup data/lang exp/tri4 exp/tri4_denlats_nodup + data/train_nodup data/lang exp/tri4 exp/tri4_denlats_nodup # 4 iterations of MMI seems to work well overall. The number of iterations is # used as an explicit argument even though train_mmi.sh will use 4 iterations by @@ -226,7 +228,7 @@ steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" \ num_mmi_iters=4 steps/train_mmi.sh --cmd "$decode_cmd" \ --boost 0.1 --num-iters $num_mmi_iters \ - data/train_nodup data/lang exp/tri4_{ali,denlats}_nodup exp/tri4_mmi_b0.1 + data/train_nodup data/lang exp/tri4_{ali,denlats}_nodup exp/tri4_mmi_b0.1 for iter in 1 2 3 4; do ( @@ -245,7 +247,7 @@ if $has_fisher; then ( steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_sw1_{tg,fsh_fg} data/eval2000 \ - exp/tri4_mmi_b0.1/decode_eval2000_${iter}.mdl_sw1_fsh_{tg,fsh_fg} + exp/tri4_mmi_b0.1/decode_eval2000_${iter}.mdl_sw1_{tg,fsh_fg} ) & done fi @@ -257,7 +259,7 @@ steps/train_diag_ubm.sh --silence-weight 0.5 --nj 50 --cmd "$train_cmd" \ steps/train_mmi_fmmi.sh --learning-rate 0.005 \ --boost 0.1 --cmd "$train_cmd" \ data/train_nodup data/lang exp/tri4_ali_nodup exp/tri4_dubm \ - exp/tri4_denlats_nodup exp/tri4_fmmi_b0.1 + exp/tri4_denlats_nodup exp/tri4_fmmi_b0.1 for iter in 4 5 6 7 8; do ( @@ -301,5 +303,18 @@ fi # demonstration script for raw-fMLLR. You should probably ignore this. # local/run_raw_fmllr.sh +# nnet3 LSTM recipe +# local/nnet3/run_lstm.sh + +# nnet3 BLSTM recipe +# local/nnet3/run_lstm.sh --affix bidirectional \ +# --lstm-delay " [-1,1] [-2,2] [-3,3] " \ +# --label-delay 0 \ +# --cell-dim 1024 \ +# --recurrent-projection-dim 128 \ +# --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 \ +# --chunk-right-context 40 + # getting results (see RESULTS file) # for x in 1 2 3a 3b 4a; do grep 'Percent Total Error' exp/tri$x/decode_eval2000_sw1_tg/score_*/eval2000.ctm.filt.dtl | sort -k5 -g | head -1; done diff --git a/egs/swbd/s5c/swbd.perf b/egs/swbd/s5c/swbd.perf new file mode 100644 index 00000000000..5151a6fdaa0 --- /dev/null +++ b/egs/swbd/s5c/swbd.perf @@ -0,0 +1,33 @@ +%WER 12.8 | 1831 21395 | 89.2 7.7 3.2 2.0 12.8 50.4 | exp/chain/tdnn_v_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.5 | 1831 21395 | 89.8 6.7 3.4 1.4 11.5 47.0 | exp/chain/tdnn_v1_trial4_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.5 | 1831 21395 | 89.6 6.6 3.8 1.1 11.5 47.1 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.4 | 1831 21395 | 89.9 6.4 3.7 1.3 11.4 46.9 | exp/chain/tdnn_v1_trial1_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.4 | 1831 21395 | 89.8 6.6 3.7 1.2 11.4 47.5 | exp/chain/tdnn_v1_trial5_sp/decode_eval2000_sw1_fsh_fg/score_12_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.3 | 1831 21395 | 90.0 6.6 3.4 1.3 11.3 46.0 | exp/chain/tdnn_v1_trial3_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.2 | 1831 21395 | 90.1 6.4 3.5 1.3 11.2 46.0 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys 
+%WER 11.1 | 1831 21395 | 90.1 6.6 3.3 1.3 11.1 46.6 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.1 6.5 3.5 1.2 11.1 45.8 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.1 6.5 3.4 1.3 11.1 45.7 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.1 6.4 3.5 1.1 11.1 46.5 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.1 6.3 3.6 1.2 11.1 46.4 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_400_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.0 6.4 3.7 1.1 11.1 46.3 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.0 6.4 3.7 1.1 11.1 46.3 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 89.9 6.3 3.8 1.1 11.1 46.1 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_11_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.3 6.4 3.3 1.3 11.0 47.1 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.4 3.3 1.2 11.0 45.8 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.4 3.3 1.2 11.0 45.8 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.4 3.3 1.2 11.0 45.8 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.4 3.3 1.2 11.0 45.8 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.3 3.5 1.2 11.0 46.5 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.3 3.5 1.2 11.0 46.1 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.2 3.6 1.2 11.0 45.9 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.2 3.5 1.2 11.0 46.3 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.1 6.3 3.5 1.2 11.0 46.1 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.9 | 1831 21395 | 90.4 6.2 3.4 1.3 10.9 46.6 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.9 | 1831 21395 | 90.3 6.3 3.4 1.2 10.9 45.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 10.9 | 1831 21395 | 90.3 6.3 3.4 1.2 10.9 45.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 10.8 | 1831 21395 | 90.4 6.3 3.2 1.3 10.8 46.3 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.8 | 1831 21395 | 90.4 6.1 3.5 1.2 10.8 45.9 | 
exp/chain/tdnn_v2_trial2_sp/decode_eval2000_300_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.7 | 1831 21395 | 90.5 6.2 3.3 1.2 10.7 45.1 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.6 | 1831 21395 | 90.6 6.2 3.1 1.3 10.6 46.1 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys diff --git a/egs/swbd/s5c/tdnn_chain.sh b/egs/swbd/s5c/tdnn_chain.sh new file mode 100755 index 00000000000..8b0362f4e90 --- /dev/null +++ b/egs/swbd/s5c/tdnn_chain.sh @@ -0,0 +1,334 @@ +exp=$1 + +if [ $exp -eq 1 ]; then +dir_name=exp/chain/tdnn_v1_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_2o_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial1 \ + --stage 12 \ + --train-stage -5 \ + --common-egs-dir exp/chain/tdnn_2o_sp/egs +fi + +if [ $exp -eq 2 ]; then +# had to reduce the batch size as there were memory issues +# models up to iteration 216 cannot be read anymore as the Read.WRite methods changed +# there are more issues, I am just restarting the experiment +dir_name=exp/chain/tdnn_v1_trial2_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_2o_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial2 \ + --stage 12 \ + --train-stage 216 \ + --minibatch-size 64 \ + --pool-type 'low-pass' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 +fi + + +if [ $exp -eq 3 ]; then + # same as trial1 but with smaller mini-batch size to be used as a control for trial2 +dir_name=exp/chain/tdnn_v1_trial3_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial3 \ + --stage 12 \ + --train-stage 298 \ + --minibatch-size 64 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 4 ]; then + # same as trial2 but with updatable convolution layers +dir_name=exp/chain/tdnn_v1_trial4_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial4 \ + --stage 12 \ + --train-stage 469 \ + --minibatch-size 64 \ + --pool-type 'weighted-average' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 5 ]; then + # this is trial2 just restarted +dir_name=exp/chain/tdnn_v1_trial5_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial5 \ + --stage 12 \ + --train-stage 182 \ + --minibatch-size 64 \ + --pool-type 'low-pass' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 6 ]; then + # same as trial2 but with per-dim affine component +dir_name=exp/chain/tdnn_v1_trial6_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh 
--affix trial6 \ + --stage 12 \ + --train-stage 323 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 7 ]; then + # same as trial2 but with per-dim affine component +dir_name=exp/chain/tdnn_v1_trial7_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial7 \ + --stage 12 \ + --train-stage -5 \ + --splice-indexes "-2,-1,0,1,2 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0 -1,0 -1,0 -1,0 -1,0" \ + --relu-dim 450 \ + --minibatch-size 64 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 8 ]; then + # same as trial2 but with updatable convolution layers +dir_name=exp/chain/tdnn_v1_trial8_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial8 \ + --stage 12 \ + --train-stage -5 \ + --minibatch-size 64 \ + --pool-type 'weighted-average' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 9 ]; then + # same as trial2 but with updatable convolution layers +dir_name=exp/chain/tdnn_v2_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial1 \ + --stage 12 \ + --train-stage -5 \ + --minibatch-size 64 \ + --pool-type 'low-pass' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 10 ]; then +dir_name=exp/chain/tdnn_v2_trial2_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial2 \ + --stage 12 \ + --train-stage -5 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 11 ]; then +dir_name=exp/chain/tdnn_v2_trial3_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial3 \ + --stage 12 \ + --train-stage -5 \ + --relu-dim 500 \ + --minibatch-size 64 \ + --pool-type 'weighted-average' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 12 ]; then +dir_name=exp/chain/tdnn_v2_trial4_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial4 \ + --stage 12 \ + --train-stage -5 \ + --relu-dim 500 \ + --minibatch-size 64 \ + --pool-type 'low-pass' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 13 ]; then +dir_name=exp/chain/tdnn_v2_trial5_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts 
frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial5 \ + --stage 12 \ + --train-stage -5 \ + --relu-dim 500 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 14 ]; then +dir_name=exp/chain/tdnn_v3_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v3.sh --affix trial1 \ + --stage 12 \ + --train-stage -1 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 15 ]; then +dir_name=exp/chain/tdnn_v4_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v4.sh --affix trial1 \ + --stage 12 \ + --train-stage 116 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 16 ]; then +dir_name=exp/chain/tdnn_v4_trial2_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v4.sh --affix trial2 \ + --stage 12 \ + --train-stage -5 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --self-repair-scale "" \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 17 ]; then + # this is very similar to v3_trial1 as expected, so discontinuing this was + # similar to v4, except for HMM leaky coefficient reducing hmm leaky + # coefficient to 1e-5, brings the training progress back to before which + # causes a lot of undertraining + +dir_name=exp/chain/tdnn_v5_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v5.sh --affix trial1 \ + --stage 12 \ + --train-stage -15 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 18 ]; then +dir_name=exp/chain/tdnn_v5_mdwa_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v5.sh --affix mdwa \ + --stage 12 \ + --train-stage 0 \ + --minibatch-size 64 \ + --pool-type 'multi-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 19 ]; then +dir_name=exp/chain/tdnn_v5_mdwa_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v5.sh --affix mdwa \ + --stage 12 \ + --train-stage -15 \ + --minibatch-size 64 \ + --pool-type 'none' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi diff --git a/egs/swbd/s5c/total.perf 
b/egs/swbd/s5c/total.perf new file mode 100644 index 00000000000..112285c817a --- /dev/null +++ b/egs/swbd/s5c/total.perf @@ -0,0 +1,33 @@ +%WER 19.3 | 4459 42989 | 83.5 11.9 4.7 2.8 19.3 57.8 | exp/chain/tdnn_v_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 17.1 | 4459 42989 | 84.9 10.2 4.9 2.0 17.1 53.9 | exp/chain/tdnn_v1_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 17.1 | 4459 42989 | 84.9 10.1 4.9 2.0 17.1 54.0 | exp/chain/tdnn_v1_trial5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 17.1 | 4459 42989 | 84.7 10.0 5.3 1.9 17.1 54.1 | exp/chain/tdnn_v1_trial4_sp/decode_eval2000_sw1_fsh_fg/score_12_0.5/eval2000_hires.ctm.filt.sys +%WER 17.0 | 4459 42989 | 85.0 9.9 5.1 1.9 17.0 53.8 | exp/chain/tdnn_v1_trial3_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 16.9 | 4459 42989 | 84.8 9.4 5.8 1.6 16.9 53.5 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.8 | 4459 42989 | 84.9 9.4 5.7 1.7 16.8 53.9 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.8 5.0 1.9 16.7 52.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.8 5.0 1.9 16.7 52.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.8 5.0 1.9 16.7 52.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.8 5.0 1.9 16.7 52.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.6 5.2 1.9 16.7 53.7 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.4 5.4 1.9 16.7 54.0 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.1 9.9 5.0 1.9 16.7 53.5 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.1 9.7 5.2 1.8 16.7 53.6 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.4 10.0 4.7 2.0 16.6 53.7 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_400_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.3 9.8 4.9 1.9 16.6 53.1 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.7 5.2 1.8 16.6 53.7 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.3 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.3 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.6 5.2 1.8 16.6 53.3 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.1 9.2 5.7 1.7 16.6 53.0 | 
exp/chain/tdnn_v2_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.1 9.2 5.7 1.7 16.6 53.0 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.5 | 4459 42989 | 85.3 9.3 5.4 1.8 16.5 53.8 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.5 | 4459 42989 | 85.2 9.5 5.3 1.7 16.5 53.6 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.5 | 4459 42989 | 85.2 9.5 5.3 1.7 16.5 53.3 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.4 | 4459 42989 | 85.5 9.6 5.0 1.9 16.4 53.3 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.4 | 4459 42989 | 85.4 9.6 5.0 1.8 16.4 53.7 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.4 | 4459 42989 | 85.4 9.5 5.1 1.8 16.4 53.3 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.3 | 4459 42989 | 85.6 9.7 4.7 1.9 16.3 53.2 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.3 | 4459 42989 | 85.4 9.4 5.2 1.8 16.3 53.8 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.1 | 4459 42989 | 85.7 9.5 4.7 1.9 16.1 52.7 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys diff --git a/egs/tedlium/s5/RESULTS b/egs/tedlium/s5/RESULTS index 9c494712aa8..0c209bddf7e 100644 --- a/egs/tedlium/s5/RESULTS +++ b/egs/tedlium/s5/RESULTS @@ -7,6 +7,27 @@ for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; d for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp exit 0 + +#---------------------------------Current results (after fixing the problem)--------------------------------- +# There was a problem with the language model preparation where the scripts expected <UNK> to represent OOV words while +# the language model used <unk> to represent them. See `git log tedlium-unk-fix` for details. +# Fixing this causes a small decrease in WER.
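# (A hedged pointer, for reference: the fix makes the scripts use the Cantab LM's own OOV symbol,
# e.g. run.sh now calls
#   utils/prepare_lang.sh data/local/dict_nosp "<unk>" data/local/lang_nosp data/lang_nosp
# and local/prepare_dict.sh adds the matching '<unk> NSN' lexicon entry; see those files later in
# this patch.)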
+ +# GMMs +# DEV SPEAKERS: +%WER 31.0 | 507 17792 | 73.5 20.2 6.3 4.5 31.0 97.2 | -0.032 | exp/tri1/decode_nosp_dev/score_11_0.0/ctm.filt.filt.sys +%WER 26.4 | 507 17792 | 77.8 16.7 5.5 4.2 26.4 95.5 | -0.066 | exp/tri2/decode_nosp_dev/score_13_0.0/ctm.filt.filt.sys +%WER 26.1 | 507 17792 | 77.2 16.3 6.5 3.4 26.1 95.5 | -0.106 | exp/tri2/decode_dev/score_14_1.0/ctm.filt.filt.sys +%WER 22.0 | 507 17792 | 81.6 13.2 5.2 3.6 22.0 93.9 | -0.189 | exp/tri3/decode_dev/score_13_1.0/ctm.filt.filt.sys + +# TEST SPEAKERS: +%WER 30.9 | 1155 27512 | 72.1 21.0 6.9 3.0 30.9 94.5 | 0.035 | exp/tri1/decode_nosp_test/score_12_0.5/ctm.filt.filt.sys +%WER 25.5 | 1155 27512 | 78.0 17.4 4.6 3.6 25.5 92.8 | -0.034 | exp/tri2/decode_nosp_test/score_12_0.0/ctm.filt.filt.sys +%WER 24.9 | 1155 27512 | 78.3 16.7 5.0 3.2 24.9 93.0 | -0.020 | exp/tri2/decode_test/score_14_0.5/ctm.filt.filt.sys +%WER 20.3 | 1155 27512 | 82.7 13.4 3.9 3.0 20.3 90.0 | -0.063 | exp/tri3/decode_test/score_14_0.5/ctm.filt.filt.sys + +#---------------------------------(Pre- fix for Cantab LM) Provided for reference---------------------------------- + # Results from Nikolay, using kaldi scoring: # %WER 35.17 [ 9677 / 27512, 1267 ins, 1681 del, 6729 sub ] exp/tri1/decode/wer_13 # %WER 30.03 [ 8262 / 27512, 1255 ins, 1367 del, 5640 sub ] exp/tri2/decode/wer_15 diff --git a/egs/tedlium/s5/cmd.sh b/egs/tedlium/s5/cmd.sh index bed97d34020..ba7f120e599 100644 --- a/egs/tedlium/s5/cmd.sh +++ b/egs/tedlium/s5/cmd.sh @@ -19,7 +19,7 @@ host=$(hostname -f) if [ ${host#*.} == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" diff --git a/egs/tedlium/s5/local/chain/README b/egs/tedlium/s5/local/chain/README new file mode 100644 index 00000000000..85e5b863a7c --- /dev/null +++ b/egs/tedlium/s5/local/chain/README @@ -0,0 +1,16 @@ +These are the instructions to reproduce the TEDLIUM models described in +"Purely sequence-trained neural networks for ASR based on lattice-free +MMI", by Povey et al. + +First run: + +./run.sh + +until the end of stage 7. (local/nnet/run_dnn.sh can be skipped.) + +Then run: + +local/chain/run_tdnn.sh + +to see results for a generic chain model. See the script's header +comments to see other options, and their results. \ No newline at end of file diff --git a/egs/tedlium/s5/local/chain/run_tdnn.sh b/egs/tedlium/s5/local/chain/run_tdnn.sh new file mode 100755 index 00000000000..804bf93f58a --- /dev/null +++ b/egs/tedlium/s5/local/chain/run_tdnn.sh @@ -0,0 +1,204 @@ +#!/bin/bash +# +# This script requires that you have run the toplevel run.sh script in TEDLIUM up to stage 7. 
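# A hedged usage sketch (not from the original header): the variables defined below are exposed as
# command-line flags by utils/parse_options.sh, so typical invocations could look like
#   local/chain/run_tdnn.sh
#   local/chain/run_tdnn.sh --stage 12 --affix _retune   # re-run only the neural-net training
# where the "_retune" affix is just an illustrative value.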
+# +# Results: (Run for x in exp/chain/tdnn/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null) +## Number of parameters: 6172530 +## %WER 14.1 | 507 17792 | 88.6 7.3 4.1 2.7 14.1 92.9 | 0.075 | exp/chain/tdnn/decode_dev/score_10_0.5/ctm.filt.filt.sys +## %WER 13.3 | 507 17792 | 89.7 6.9 3.4 2.9 13.3 92.1 | 0.000 | exp/chain/tdnn/decode_dev_rescore/score_10_0.0/ctm.filt.filt.sys +## %WER 13.8 | 1155 27512 | 89.4 7.5 3.1 3.2 13.8 87.9 | 0.101 | exp/chain/tdnn/decode_test/score_10_0.0/ctm.filt.filt.sys +## %WER 12.9 | 1155 27512 | 90.1 6.6 3.3 2.9 12.9 86.1 | 0.043 | exp/chain/tdnn/decode_test_rescore/score_10_0.0/ctm.filt.filt.sys +# The final WER (rescored WER on the test set) is what we are interested in. + +# To reproduce the setup used in the paper, set the following variables: +# affix=_more_ce +# relu_dim=525 +# xent_regularize=0.2 +# +# Results: (Run for x in exp/chain/tdnn_more_ce/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null) +## Number of parameters: 8758742 +## %WER 14.3 | 507 17792 | 89.0 7.8 3.2 3.3 14.3 93.5 | 0.116 | exp/chain/tdnn_more_ce/decode_dev/score_10_0.0/ctm.filt.filt.sys +## %WER 13.0 | 507 17792 | 90.0 6.9 3.2 2.9 13.0 91.3 | -0.003 | exp/chain/tdnn_more_ce/decode_devv_rescore/score_10_0.0/ctm.filt.filt.sys +## %WER 13.8 | 1155 27512 | 89.1 7.4 3.4 2.9 13.8 87.5 | 0.082 | exp/chain/tdnn_more_ce/decode_test/score_10_0.5/ctm.filt.filt.sys +## %WER 12.8 | 1155 27512 | 90.4 6.6 3.1 3.1 12.8 86.7 | 0.014 | exp/chain/tdnn_more_ce/decode_test_rescore/score_10_0.0/ctm.filt.filt.sys + +set -uo pipefail + +# configs for 'chain' +affix= +stage=0 # After running the entire script once, you can set stage=12 to tune the neural net only. +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=8 +minibatch_size=128 +relu_dim=425 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +dir=${dir}${affix} + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 4000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py \ + $repair_opts \ + --feat-dir data/train_sp_hires \ + --ivector-dir exp/nnet3/ivectors_train_sp \ + --tree-dir $treedir \ + --relu-dim $relu_dim \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then + # spread the egs over various machines. will help reduce overload of any + # one machine. + utils/create_split_dir.pl /export/b{09,10,11,12}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_train_sp \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 20 \ + --feat-dir data/train_sp_hires \ + --tree-dir $treedir \ + --lat-dir $lats_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + + for decode_set in dev test; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $(wc -l data/$decode_set/spk2utt) --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + --scoring-opts "--min_lmwt 5 --max_lmwt 15" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test data/lang_rescore data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter} \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_rescore || exit 1; + ) & + done +fi + +wait \ No newline at end of file diff --git a/egs/tedlium/s5/local/confidence_calibration.sh b/egs/tedlium/s5/local/confidence_calibration.sh new file mode 100755 index 00000000000..0eb3dc21521 --- /dev/null +++ b/egs/tedlium/s5/local/confidence_calibration.sh @@ -0,0 +1,81 @@ +#!/bin/bash +. cmd.sh +. path.sh + +# Global options, +graph=exp/tri3/graph +arpa_gz=db/cantab-TEDLIUM/cantab-TEDLIUM-pruned.lm3.gz +lmwt=13 + +# Dev-set options, +dev_data=data/dev +dev_latdir=exp/tri3_mmi_b0.1/decode_dev_it4 + +# Eval-set options, +eval_data=data/test +eval_latdir=exp/tri3_mmi_b0.1/decode_test_it4 + +. utils/parse_options.sh +set -euxo pipefail + +# Derived options, +dev_caldir=$dev_latdir/confidence_$lmwt +eval_caldir=$eval_latdir/confidence_$lmwt + +###### Data preparation, + +# Prepare filtering for excluding data from train-set (1 .. keep word, 0 .. exclude word), +# - only excludes from training-targets, the confidences are recalibrated for all the words, +word_filter=$(mktemp) +awk '{ keep_the_word = $1 !~ /^(\[.*\]|<.*>|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graph/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graph/phones/align_lexicon.txt \ + r=1 $graph/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graph/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt $lmwt \ + $dev_data $graph $word_feats $dev_latdir $dev_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $eval_data $graph $eval_latdir $dev_caldir $eval_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. +# Please compare with the default scoring script for your database. 
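# For orientation, a hedged illustration (the utterance id, times and confidence below are made up):
# entries in $eval_caldir/ctm_calibrated follow the usual CTM layout
#   <utterance-id> <channel> <start-time> <duration> <word> <calibrated-confidence>
# e.g. "AlGore_2009-0001 1 12.34 0.31 climate 0.97", and can be sanity-checked with
#   head $eval_caldir/ctm_calibrated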
+ +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$eval_caldir/ctm_calibrated +stm=$eval_data/stm +glm=$eval_data/glm + +# Normalizng CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -v -E '\[BREATH|NOISE|COUGH|SMACK|UM|UH\]' | \ + grep -v -E '"!SIL|\' >${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $eval_data/segments $eval_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv diff --git a/egs/tedlium/s5/local/join_suffix.py b/egs/tedlium/s5/local/join_suffix.py index 25b097ed0e4..55cc9ba37ac 100755 --- a/egs/tedlium/s5/local/join_suffix.py +++ b/egs/tedlium/s5/local/join_suffix.py @@ -5,9 +5,10 @@ import sys +from codecs import open words = set() -for line in open(sys.argv[1]): +for line in open(sys.argv[1], encoding='utf8'): items = line.split() words.add(items[0]) @@ -16,12 +17,10 @@ new_items = [] i = 1 while i < len(items): - if i < len(items) - 1 and items[i+1][0] == '\'' and items[i] + items[i+1] in words: - new_items.append(items[i] + items[i+1]) - i = i + 1 - else: - new_items.append(items[i]) - i = i + 1 - - print items[0], " ".join(new_items) - + if i < len(items) - 1 and items[i+1][0] == '\'' and items[i] + items[i+1] in words: + new_items.append(items[i] + items[i+1]) + i = i + 1 + else: + new_items.append(items[i]) + i = i + 1 + print(items[0] + ' ' + ' '.join(new_items)) diff --git a/egs/tedlium/s5/local/nnet/run_dnn_bn.sh b/egs/tedlium/s5/local/nnet/run_dnn_bn.sh index 909d1b2f253..3bd0dc2a1ea 100755 --- a/egs/tedlium/s5/local/nnet/run_dnn_bn.sh +++ b/egs/tedlium/s5/local/nnet/run_dnn_bn.sh @@ -146,7 +146,7 @@ if [ $stage -le 5 ]; then --transform-dir $gmm/decode_$(basename $test_bn) \ $test_bn_fmllr $test_bn $gmm $test_bn_fmllr/log $test_bn_fmllr/data || exit 1; # Training set - steps/nnet/make_fmllr_feats.sh --nj $njfea --cmd "$train_cmd -tc 10" \ + steps/nnet/make_fmllr_feats.sh --nj $njfea --cmd "$train_cmd --max-jobs-run 10" \ --transform-dir ${gmm}_ali \ $train_bn_fmllr $train_bn $gmm $train_bn_fmllr/log $train_bn_fmllr/data || exit 1; # Split the training set diff --git a/egs/tedlium/s5/local/nnet/run_dnn_fbank.sh b/egs/tedlium/s5/local/nnet/run_dnn_fbank.sh index 6403db12f3e..762b8a71307 100755 --- a/egs/tedlium/s5/local/nnet/run_dnn_fbank.sh +++ b/egs/tedlium/s5/local/nnet/run_dnn_fbank.sh @@ -40,7 +40,7 @@ stage=0 steps/compute_cmvn_stats.sh $test $test/log $test/data || exit 1; # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; # Split the training set diff --git a/egs/tedlium/s5/local/nnet/run_lstm.sh b/egs/tedlium/s5/local/nnet/run_lstm.sh index 3293724cfb3..a8d6326812e 100755 --- a/egs/tedlium/s5/local/nnet/run_lstm.sh +++ b/egs/tedlium/s5/local/nnet/run_lstm.sh @@ -29,7 +29,7 @@ stage=0 steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1; # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || 
exit 1; # Split the training set @@ -46,7 +46,7 @@ if [ $stage -le 1 ]; then steps/nnet/train.sh --network-type lstm --learn-rate 0.00001 \ --cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \ --proto-opts "--clip-gradient 5.0" \ - --train-opts "--momentum 0.9 --halving-factor 0.65" \ + --train-tool-opts "--momentum 0.9 --halving-factor 0.65" \ --train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; diff --git a/egs/tedlium/s5/local/nnet3/README b/egs/tedlium/s5/local/nnet3/README new file mode 100644 index 00000000000..6b77eb121b8 --- /dev/null +++ b/egs/tedlium/s5/local/nnet3/README @@ -0,0 +1,9 @@ +To produce the results from: + +"Purely sequence-trained neural networks for ASR based on lattice-free MMI", Povey et al. + +Run the following in order: + +./run.sh +local/nnet3/run_tdnn.sh +local/nnet3/run_tdnn_discriminative.sh \ No newline at end of file diff --git a/egs/tedlium/s5/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..0b1738a2e8e --- /dev/null +++ b/egs/tedlium/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# This is based on: +# swbd/s5c/local/nnet3/run_ivector_common.sh and +# tedlium/s5/local/online/run_nnet2_ms_perturbed.sh +# see the chain docs for general direction on what training is doing! + +set -uo pipefail +stage=1 +generate_alignments=true # false if doing ctc training + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +mkdir -p exp/nnet3 +# perturb the data +train_set=train +if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to perturb the normal data to get the alignment + + utils/perturb_data_dir_speed.sh 0.9 data/${train_set} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${train_set} data/temp2 + utils/combine_data.sh data/${train_set}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${train_set}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${train_set}_tmp exp/make_mfcc/${train_set}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_tmp exp/make_mfcc/${train_set}_tmp $mfccdir || exit1; + utils/fix_data_dir.sh data/${train_set}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${train_set} data/temp0 + utils/combine_data.sh data/${train_set}_sp data/${train_set}_tmp data/temp0 + utils/fix_data_dir.sh data/${train_set}_sp + rm -r data/temp0 data/${train_set}_tmp +fi + +train_set_sp=${train_set}_sp + +if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + # obtain the alignment of the pertubed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${train_set_sp} data/lang_nosp exp/tri3 exp/tri3_ali_sp || exit 1 +fi + +if [ $stage -le 3 ]; then + + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b{09,10,11,12}/$USER/kaldi-data/egs/tedlium-$date/s5/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $train_set $train_set_sp; do + data_dir=data/${dataset}_hires + utils/copy_data_dir.sh data/$dataset $data_dir + + # this next section does volume perturbation on the data. 
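  # For illustration (hypothetical utterance and gain): the loop below turns a wav.scp entry such as
  #   utt1 sph2pipe -f wav -p -c 1 db/TEDLIUM_release1/train/sph/utt1.sph |
  # into
  #   utt1 sph2pipe -f wav -p -c 1 db/TEDLIUM_release1/train/sph/utt1.sph | sox --vol 1.37 -t wav - -t wav - |
  # with the gain drawn uniformly from [1/8, 2.0].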
+ cat $data_dir/wav.scp | python -c " +import sys, os, subprocess, re, random +random.seed(0) +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + $data_dir exp/make_hires/$dataset $mfccdir + steps/compute_cmvn_stats.sh $data_dir exp/make_hires/$dataset $mfccdir + utils/fix_data_dir.sh $data_dir # remove segments with problems + done + + for dataset in dev test; do + data_dir=data/${dataset}_hires + utils/copy_data_dir.sh data/$dataset $data_dir + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + $data_dir exp/make_hires/$dataset $mfccdir + steps/compute_cmvn_stats.sh $data_dir exp/make_hires/$dataset $mfccdir + utils/fix_data_dir.sh $data_dir # remove segments with problems + done +fi + +# ivector extractor training +if [ $stage -le 5 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + # Note: We do NOT use speed-perturbed data in this step. + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/${train_set}_hires \ + data/lang_nosp exp/tri3_ali exp/nnet3/tri3b +fi + +if [ $stage -le 6 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ + data/${train_set_sp}_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${train_set_sp}_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set_sp}_hires \ + data/${train_set_sp}_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set_sp}_hires_max2 exp/nnet3/extractor exp/nnet3/ivectors_${train_set_sp} || exit 1 + + for data_set in dev test; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ + data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data_set} || exit 1; + done +fi diff --git a/egs/tedlium/s5/local/nnet3/run_tdnn.sh b/egs/tedlium/s5/local/nnet3/run_tdnn.sh new file mode 100755 index 00000000000..4eabd9fae0b --- /dev/null +++ b/egs/tedlium/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. 
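# A hedged note on "multi-splice" (not part of the original header): each TDNN layer splices its
# input over a small set of frame offsets. For example, with the chain-style setting used in
# local/chain/run_tdnn.sh above,
#   --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0"
# the per-layer left offsets sum to 1+1+3+3+3+6+0 = 17 and the right offsets to 1+2+3+3+3+0+0 = 12,
# so each output frame depends on input frames in roughly [t-17, t+12]; the indexes used by this
# nnet3 script itself may differ.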
+ +# Results (2 epochs): +# Number of parameters: 6056880 +# %WER 15.3 | 507 17792 | 87.4 9.0 3.6 2.7 15.3 90.1 | -0.081 | exp/nnet3/tdnn_sp/decode_dev/score_10_0.5/ctm.filt.filt.sys +# %WER 13.9 | 507 17792 | 88.4 8.0 3.6 2.3 13.9 85.8 | -0.164 | exp/nnet3/tdnn_sp/decode_dev_rescore/score_10_0.5/ctm.filt.filt.sys +# %WER 13.8 | 1155 27512 | 88.5 8.7 2.7 2.3 13.8 84.2 | -0.076 | exp/nnet3/tdnn_sp/decode_test/score_10_0.0/ctm.filt.filt.sys +# %WER 12.5 | 1155 27512 | 89.6 7.7 2.6 2.1 12.5 81.5 | -0.133 | exp/nnet3/tdnn_sp/decode_test_rescore/score_10_0.0/ctm.filt.filt.sys + +# 4 epochs +# %WER 14.6 | 507 17792 | 87.9 8.7 3.4 2.5 14.6 88.6 | -0.111 | exp/nnet3/tdnn/decode_dev/score_10_0.5/ctm.filt.filt.sys +# %WER 13.2 | 507 17792 | 89.4 7.7 2.9 2.6 13.2 85.0 | -0.170 | exp/nnet3/tdnn/decode_dev_rescore/score_10_0.0/ctm.filt.filt.sys +# %WER 13.5 | 1155 27512 | 88.7 8.5 2.7 2.3 13.5 83.6 | -0.110 | exp/nnet3/tdnn/decode_test/score_10_0.0/ctm.filt.filt.sys +# %WER 12.1 | 1155 27512 | 89.9 7.5 2.6 2.1 12.1 80.3 | -0.178 | exp/nnet3/tdnn/decode_test_rescore/score_10_0.0/ctm.filt.filt.sys + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=1 +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true +decode_iter= + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <" | grep -v "" | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt +cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \ + LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ - grep -v SIL | sort > $dir/nonsilence_phones.txt + grep -v SIL | sort > $dir/nonsilence_phones.txt ( echo SIL; echo BRH; echo CGH; echo NSN ; echo SMK; echo UM; echo UHH ) > $dir/silence_phones.txt @@ -27,9 +29,11 @@ echo SIL > $dir/optional_silence.txt echo -n >$dir/extra_questions.txt # Add to the lexicon the silences, noises etc. +# Typically, you would use "<UNK> NSN" here, but the Cantab Research language models +# use <unk> instead of <UNK> to represent out of vocabulary words. (echo '!SIL SIL'; echo '[BREATH] BRH'; echo '[NOISE] NSN'; echo '[COUGH] CGH'; echo '[SMACK] SMK'; echo '[UM] UM'; echo '[UH] UHH' - echo '<UNK> NSN' ) | \ + echo '<unk> NSN' ) | \ cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt # Check that the dict dir is okay! diff --git a/egs/tedlium/s5/local/prepare_lm.sh b/egs/tedlium/s5/local/prepare_lm.sh index 21e92704e23..e1efe628483 100755 --- a/egs/tedlium/s5/local/prepare_lm.sh +++ b/egs/tedlium/s5/local/prepare_lm.sh @@ -1,6 +1,6 @@ -#!/bin/bash +#!/bin/bash # -# Copyright 2014 Nickolay V. Shmyrev +# Copyright 2014 Nickolay V. Shmyrev # Apache 2.0 @@ -12,21 +12,8 @@ arpa_lm=db/cantab-TEDLIUM/cantab-TEDLIUM-pruned.lm3.gz rm -rf data/lang_nosp_test cp -r data/lang_nosp data/lang_nosp_test -# grep -v '<s> <s>' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it].
-gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_nosp_test/words.txt \ - --osymbols=data/lang_nosp_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_nosp_test/G.fst +gunzip -c "$arpa_lm" | arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_nosp_test/words.txt - data/lang_nosp_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" diff --git a/egs/tedlium/s5/local/score_sclite.sh b/egs/tedlium/s5/local/score_sclite.sh index 518ba040659..7b0915abea4 100755 --- a/egs/tedlium/s5/local/score_sclite.sh +++ b/egs/tedlium/s5/local/score_sclite.sh @@ -13,6 +13,7 @@ beam=7 # speed-up, but may affect MBR confidences. word_ins_penalty=0.0,0.5,1.0 min_lmwt=10 max_lmwt=20 +iter=final #end configuration section. [ -f ./path.sh ] && . ./path.sh @@ -32,7 +33,7 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; @@ -48,6 +49,15 @@ nj=$(cat $dir/num_jobs) mkdir -p $dir/scoring/log +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + if [ $stage -le 0 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ @@ -58,7 +68,7 @@ if [ $stage -le 0 ]; then lattice-prune --beam=$beam ark:- ark:- \| \ lattice-align-words-lexicon --output-error-lats=true --max-expand=10.0 --test=false \ $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \| \ sort -k1,1 -k2,2 -k3,3nb '>' $dir/score_LMWT_${wip}/ctm || exit 1; @@ -68,8 +78,10 @@ fi if [ $stage -le 1 ]; then # Remove some stuff we don't want to score, from the ctm. for x in $dir/score_*/ctm; do - cat $x | grep -v -E '"\[BREATH|NOISE|COUGH|SMACK|UM|UH\]"' | \ - grep -v -E '"!SIL|\"' > ${x}.filt || exit 1; + # `-i` is not needed in the following. It is added for robustness in ase this code is copy-pasted + # into another script that, e.g., uses instead of + cat $x | grep -v -w -i -E '\[BREATH|NOISE|COUGH|SMACK|UM|UH\]' | \ + grep -v -w -i -E '!SIL|' > ${x}.filt || exit 1; done fi diff --git a/egs/tedlium/s5/path.sh b/egs/tedlium/s5/path.sh index dcefaea23d8..16d5314b9c2 100755 --- a/egs/tedlium/s5/path.sh +++ b/egs/tedlium/s5/path.sh @@ -1,3 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. 
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/tedlium/s5/run.sh b/egs/tedlium/s5/run.sh index 7a36e49e8e0..e1dbf7b80e0 100755 --- a/egs/tedlium/s5/run.sh +++ b/egs/tedlium/s5/run.sh @@ -9,7 +9,7 @@ # The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, # which allow free non-commercial use, while only a citation is required. # -# Copyright 2014 Nickolay V. Shmyrev +# Copyright 2014 Nickolay V. Shmyrev # 2014 Brno University of Technology (Author: Karel Vesely) # Apache 2.0 # @@ -28,17 +28,18 @@ stage=0 # Data preparation if [ $stage -le 0 ]; then local/download_data.sh || exit 1 - + local/prepare_data.sh || exit 1 local/prepare_dict.sh || exit 1 utils/prepare_lang.sh data/local/dict_nosp \ - "<UNK>" data/local/lang_nosp data/lang_nosp || exit 1 + "<unk>" data/local/lang_nosp data/lang_nosp || exit 1 local/prepare_lm.sh || exit 1 fi + # Feature extraction feat_dir=$pwd/data/mfcc_features if [ $stage -le 1 ]; then @@ -100,7 +101,7 @@ if [ $stage -le 5 ]; then data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ exp/tri2/sil_counts_nowb.txt \ exp/tri2/pron_bigram_counts_nowb.txt data/local/dict - + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang cp -rT data/lang data/lang_test cp -rT data/lang data/lang_rescore @@ -134,6 +135,8 @@ if [ $stage -le 6 ]; then exp/tri3/graph data/test exp/tri3/decode_test || exit 1 fi +# steps/cleanup/debug_lexicon.sh --nj 100 --alidir exp/tri3 --cmd "$train_cmd" data/train data/lang exp/tri3 data/local/dict/lexicon.txt exp/tri3_debug_lexicon & + if [ $stage -le 7 ]; then steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/tri3 exp/tri3_ali || exit 1 diff --git a/egs/thchs30/README.txt b/egs/thchs30/README.txt new file mode 100644 index 00000000000..acbdea4a263 --- /dev/null +++ b/egs/thchs30/README.txt @@ -0,0 +1,10 @@ +THCHS30 is an open Chinese speech database published by Center for Speech and Language Technology (CSLT) at Tsinghua University. + +The original recording was conducted in 2002 by Dong Wang, supervised by Prof. Xiaoyan Zhu, at the Key State Lab of Intelligence and System, Department of Computer Science, Tsinghua University, and the original name was 'TCMSD', standing for 'Tsinghua Continuous Mandarin Speech Database'. The publication after 13 years has been initiated by Dr. Dong Wang and was supported by Prof. Xiaoyan Zhu. We hope to provide a toy database for new researchers in the field of speech recognition. Therefore, the database is totally free to academic users.
+ +The database can be downloaded from openslr: +http://www.openslr.org/18/ + +or from the CSLT server: +http://data.cslt.org/thchs30/README.html + diff --git a/egs/thchs30/s5/RESULTS b/egs/thchs30/s5/RESULTS new file mode 100644 index 00000000000..70718ea4c2a --- /dev/null +++ b/egs/thchs30/s5/RESULTS @@ -0,0 +1,61 @@ +#!/bin/bash +for x in exp/{mono,tri1,tri2b,tri3b,tri4b,tri4b_dnn,tri4b_dnn_mpe}/decode_test_phone* ; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean mono,tri1,tri2b,tri3b,GMM,DNN model +#clean test data +#phone task +%WER 31.49 [ 113986 / 362027, 20820 ins, 22043 del, 71123 sub ] exp/mono/decode_test_phone/wer_5 +%WER 20.56 [ 74445 / 362027, 15452 ins, 12457 del, 46536 sub ] exp/tri1/decode_test_phone/wer_5 +%WER 17.32 [ 62689 / 362027, 11937 ins, 11260 del, 39492 sub ] exp/tri2b/decode_test_phone/wer_6 +%WER 18.06 [ 65368 / 362027, 10426 ins, 13780 del, 41162 sub ] exp/tri3b/decode_test_phone/wer_5 +%WER 18.50 [ 66984 / 362027, 13117 ins, 11917 del, 41950 sub ] exp/tri3b/decode_test_phone.si/wer_5 +%WER 16.17 [ 58544 / 362027, 9628 ins, 11746 del, 37170 sub ] exp/tri4b/decode_test_phone/wer_6 +%WER 16.59 [ 60060 / 362027, 11440 ins, 10477 del, 38143 sub ] exp/tri4b/decode_test_phone.si/wer_6 +%WER 10.27 [ 37173 / 362027, 8675 ins, 6483 del, 22015 sub ] exp/tri4b_dnn/decode_test_phone/wer_4 +%WER 10.11 [ 36591 / 362027, 8702 ins, 6255 del, 21634 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it1/wer_4 +%WER 10.03 [ 36321 / 362027, 7490 ins, 6731 del, 22100 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it2/wer_5 +%WER 10.01 [ 36249 / 362027, 7507 ins, 6677 del, 22065 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it3/wer_5 + +exit 0 + +for x in exp/{mono,tri1,tri2b,tri3b,tri4b,tri4b_dnn,tri4b_dnn_mpe}/decode_test_word* ; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean mono,tri1,tri2b,tri3b,GMM,DNN model +#clean test data +#word task +%WER 51.04 [ 41414 / 81139, 474 ins, 2404 del, 38536 sub ] exp/mono/decode_test_word/wer_9 +%WER 36.38 [ 29522 / 81139, 516 ins, 1096 del, 27910 sub ] exp/tri1/decode_test_word/wer_10 +%WER 32.51 [ 26379 / 81139, 469 ins, 940 del, 24970 sub ] exp/tri2b/decode_test_word/wer_9 +%WER 31.65 [ 25684 / 81139, 340 ins, 1085 del, 24259 sub ] exp/tri3b/decode_test_word/wer_9 +%WER 34.07 [ 27643 / 81139, 443 ins, 1100 del, 26100 sub ] exp/tri3b/decode_test_word.si/wer_10 +%WER 29.64 [ 24052 / 81139, 341 ins, 929 del, 22782 sub ] exp/tri4b/decode_test_word/wer_11 +%WER 31.71 [ 25732 / 81139, 472 ins, 902 del, 24358 sub ] exp/tri4b/decode_test_word.si/wer_10 +%WER 23.57 [ 19123 / 81139, 419 ins, 585 del, 18119 sub ] exp/tri4b_dnn/decode_test_word/wer_7 +%WER 23.40 [ 18984 / 81139, 397 ins, 567 del, 18020 sub ] exp/tri4b_dnn_mpe/decode_test_word_it1/wer_7 +%WER 23.27 [ 18884 / 81139, 396 ins, 553 del, 17935 sub ] exp/tri4b_dnn_mpe/decode_test_word_it2/wer_7 +%WER 23.18 [ 18804 / 81139, 368 ins, 618 del, 17818 sub ] exp/tri4b_dnn_mpe/decode_test_word_it3/wer_8 + +exit 0 + +for x in exp/{tri4b_dnn_mpe,tri4b_dnn_dae}/decode_phone_0db/{white,car,cafe}; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean MPE model and mixture DAE model +#0db noise test data +#phone task +%WER 84.01 [ 304141 / 362027, 717 ins, 275948 del, 27476 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/white/wer_4 +%WER 14.11 [ 51074 / 362027, 10941 ins, 8175 del, 31958 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/car/wer_5 +%WER 71.63 [ 259329 / 362027, 6164 ins, 217508 del, 35657 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/cafe/wer_4 +%WER 40.04 [ 144946 / 
362027, 17764 ins, 35162 del, 92020 sub ] exp/tri4b_dnn_dae/decode_phone_0db/white/wer_6 +%WER 11.81 [ 42773 / 362027, 9598 ins, 7552 del, 25623 sub ] exp/tri4b_dnn_dae/decode_phone_0db/car/wer_5 +%WER 32.39 [ 117256 / 362027, 17793 ins, 27750 del, 71713 sub ] exp/tri4b_dnn_dae/decode_phone_0db/cafe/wer_6 +exit 0 + +for x in exp/{tri4b_dnn_mpe,tri4b_dnn_dae}/decode_word_0db/{white,car,cafe}; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean MPE model and mixture DAE model +#0db noise test data +#word task +%WER 98.56 [ 79973 / 81139, 15 ins, 64293 del, 15665 sub ] exp/tri4b_dnn_mpe/decode_word_0db/white/wer_4 +%WER 28.10 [ 22799 / 81139, 553 ins, 661 del, 21585 sub ] exp/tri4b_dnn_mpe/decode_word_0db/car/wer_8 +%WER 85.58 [ 69438 / 81139, 321 ins, 49066 del, 20051 sub ] exp/tri4b_dnn_mpe/decode_word_0db/cafe/wer_8 +%WER 65.23 [ 52923 / 81139, 827 ins, 4198 del, 47898 sub ] exp/tri4b_dnn_dae/decode_word_0db/white/wer_13 +%WER 25.12 [ 20379 / 81139, 444 ins, 676 del, 19259 sub ] exp/tri4b_dnn_dae/decode_word_0db/car/wer_9 +%WER 53.38 [ 43308 / 81139, 907 ins, 4164 del, 38237 sub ] exp/tri4b_dnn_dae/decode_word_0db/cafe/wer_12 + +exit 0 diff --git a/egs/thchs30/s5/cmd.sh b/egs/thchs30/s5/cmd.sh new file mode 100644 index 00000000000..1d8e768790f --- /dev/null +++ b/egs/thchs30/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/thchs30/s5/conf/decode_dnn.config b/egs/thchs30/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/thchs30/s5/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/thchs30/s5/conf/fbank.conf b/egs/thchs30/s5/conf/fbank.conf new file mode 100644 index 00000000000..8e6e36c69cf --- /dev/null +++ b/egs/thchs30/s5/conf/fbank.conf @@ -0,0 +1,3 @@ +# No non-default options for now. +#--sample-frequency=8000 +--num-mel-bins=40 diff --git a/egs/thchs30/s5/conf/mfcc.conf b/egs/thchs30/s5/conf/mfcc.conf new file mode 100644 index 00000000000..47d6c48bfe5 --- /dev/null +++ b/egs/thchs30/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +#--sample-frequency=8000 diff --git a/egs/thchs30/s5/local/dae/add-noise-mod.py b/egs/thchs30/s5/local/dae/add-noise-mod.py new file mode 100755 index 00000000000..33e8a297aef --- /dev/null +++ b/egs/thchs30/s5/local/dae/add-noise-mod.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# Copyright 2016 Tsinghua University (Author: Chao Liu, Dong Wang). Apache 2.0. 
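As the comment block at the top of the new thchs30 cmd.sh (a little earlier in this patch) says, sites without GridEngine or slurm can swap queue.pl for run.pl. A minimal local-machine variant of that file might look like the sketch below; run.pl executes jobs on the local host, so keep the --nj values modest to avoid exhausting memory:

# cmd.sh for a single machine with no queueing system (sketch, not part of the patch)
export train_cmd=run.pl
export decode_cmd=run.pl
export mkgraph_cmd=run.pl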
+ + +from __future__ import print_function +import optparse +import random +import bisect +import re +import logging +import wave +import math +import struct +import sys +import os + +try: + import pyximport; pyximport.install() + from thchs30_util import * +except: + print("Cython possibly not installed, using standard python code. The process might be slow", file=sys.stderr) + + def energy(mat): + return float(sum([x * x for x in mat])) / len(mat) + + def mix(mat, noise, pos, scale): + ret = [] + l = len(noise) + for i in xrange(len(mat)): + x = mat[i] + d = int(x + scale * noise[pos]) + #if d > 32767 or d < -32768: + # logging.debug('overflow occurred!') + d = max(min(d, 32767), -32768) + ret.append(d) + pos += 1 + if pos == l: + pos = 0 + return (pos, ret) + + +def dirichlet(params): + samples = [random.gammavariate(x, 1) if x > 0 else 0. for x in params] + samples = [x / sum(samples) for x in samples] + for x in xrange(1, len(samples)): + samples[x] += samples[x - 1] + return bisect.bisect_left(samples, random.random()) + +def wave_mat(wav_filename): + f = wave.open(wav_filename, 'r') + n = f.getnframes() + ret = f.readframes(n) + f.close() + return list(struct.unpack('%dh' % n, ret)) + +def num_samples(mat): + return len(mat) + +def scp(scp_filename): + with open(scp_filename) as f: + for l in f: + yield tuple(l.strip().split()) + +def wave_header(sample_array, sample_rate): + byte_count = (len(sample_array)) * 2 # short + # write the header + hdr = struct.pack(' len(n): + noise_energies[type] = energy(n[p::]+n[0:len(n)-p:]) + else: + noise_energies[type] = energy(n[p:p+len(mat):]) + scale = math.sqrt(noise / noise_energies[type]) + logging.debug('noise scale: %f', scale) + pos, result = mix(mat, n, p, scale) + noises[type] = (pos, n) + if args.wavdir != 'NULL': + output_wave_file(args.wavdir, tag, result) + else: + output(tag, result) + +if __name__ == '__main__': + main() + + + diff --git a/egs/thchs30/s5/local/dae/run_dae.sh b/egs/thchs30/s5/local/dae/run_dae.sh new file mode 100755 index 00000000000..f6a6db3a01a --- /dev/null +++ b/egs/thchs30/s5/local/dae/run_dae.sh @@ -0,0 +1,149 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#Conducts experiments of dae-based denoisng + +stage=0 +nj=8 + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) +. utils/parse_options.sh || exit 1; + +thchs=$1 + +#generate noisy data. We focuse on the 0db condition. +#For training set, generate noisy data with SNR mean=0, variance=10, with three noise types mixed together. +#For dev, generate noisy data with SNR mean=0, variance=0, with three niose types mixed together +#For test, use the standard test data which were generated by SNR mean=0, variance=0. + +if [ $stage = 0 ]; then + #generat noise.scp + mkdir -p data/dae/noise && \ + awk '{print $1 " '$thchs'/resource/noise/"$2}' $thchs/resource/noise/noise.scp > data/dae/noise/noise.scp || exit 1 + + echo "DAE: generate training data..." + noise_scp=data/dae/noise/noise.scp + noise_prior="0.0,10.0,10.0,10.0" #define noise type to sample. 
[S_clean, S_white, S_car, S_cafe] + noise_level=0 #0db condition + sigma0=10 #some random in SNR + seed=32 + verbose=0 + wavdir=wav/dae/train + rm -rf data/dae/train && mkdir -p data/dae/train || exit 1 + cp data/fbank/train/{spk2utt,utt2spk,text} data/dae/train || exit 1 + mkdir -p $wavdir && awk '{print $1 " '$wavdir'/"$1".wav"}' data/fbank/train/wav.scp > data/dae/train/wav.scp || exit 1 + + mkdir -p exp/dae/gendata + split_scps="" + for n in $(seq $nj); do + split_scps="$split_scps exp/dae/gendata/train_split_${n}.scp" + done + utils/split_scp.pl data/fbank/train/wav.scp $split_scps || exit 1 + $train_cmd JOB=1:$nj exp/dae/gendata/add_noise_train.JOB.log \ + local/dae/add-noise-mod.py --noise-level $noise_level \ + --sigma0 $sigma0 --seed $seed --verbose $verbose \ + --noise-prior $noise_prior --noise-src $noise_scp \ + --wav-src exp/dae/gendata/train_split_JOB.scp --wavdir $wavdir \ + || exit 1 + + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/train exp/dae/gendata fbank/dae/train || exit 1 + steps/compute_cmvn_stats.sh data/dae/train exp/dae/cmvn \ + fbank/dae/train || exit 1 + + #genreate dev data. Just the 0db condition is produced. Multiple noise types mixed together. + echo "DAE: generating dev data..." + wavdir=wav/dae/dev/0db + sigma0=0 #no random in SNR + rm -rf data/dae/dev/0db && mkdir -p data/dae/dev/0db && \ + cp -L data/fbank/dev/{spk2utt,utt2spk,text} data/dae/dev/0db || exit 1 + mkdir -p $wavdir && awk '{print $1 " '$wavdir'/"$1".wav"}' data/fbank/dev/wav.scp > data/dae/dev/0db/wav.scp || exit 1 + + split_scps="" + for n in $(seq $nj); do + split_scps="$split_scps exp/dae/gendata/dev_split_${n}.scp" + done + utils/split_scp.pl data/fbank/dev/wav.scp $split_scps || exit 1 + + $train_cmd JOB=1:$nj exp/dae/gendata/add_noise_dev.JOB.log \ + local/dae/add-noise-mod.py --noise-level $noise_level \ + --sigma0 $sigma0 --seed $seed --verbose $verbose \ + --noise-prior $noise_prior --noise-src $noise_scp \ + --wav-src exp/dae/gendata/dev_split_JOB.scp --wavdir $wavdir \ + || exit 1 + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/dev/0db exp/dae/gendata fbank/dae/dev/0db || exit 1 + steps/compute_cmvn_stats.sh data/dae/dev/0db exp/dae/cmvn \ + fbank/dae/dev/0db || exit 1 + + #generate test data. Assume it has been downloaded in $thchs/test-noise + echo "DAE: generating test data..." 
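The generator invoked above exposes the mixing parameters on its command line (--noise-level, --sigma0, --noise-prior, --noise-src, --wav-src, --wavdir). Assuming --noise-level is the target SNR in dB, as the "0db condition" comments suggest, a hypothetical 5 dB dev condition could be produced with the same pattern (the output directory name is illustrative):

wavdir=wav/dae/dev/5db
mkdir -p $wavdir
local/dae/add-noise-mod.py --noise-level 5 --sigma0 0 --seed 32 --verbose 0 \
  --noise-prior "0.0,10.0,10.0,10.0" --noise-src data/dae/noise/noise.scp \
  --wav-src exp/dae/gendata/dev_split_1.scp --wavdir $wavdir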
+ #generate fbank + for x in car white cafe; do + echo "producing fbanks for $x" + mkdir -p data/dae/test/0db/$x && \ + cp -L data/fbank/test/{spk2utt,utt2spk,text} data/dae/test/0db/$x && \ + awk '{print $1 " '$thchs'/test-noise/0db/'$x'/"$1".wav"}' data/fbank/test/wav.scp > data/dae/test/0db/$x/wav.scp || exit 1 + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/test/0db/$x exp/dae/gendata fbank/dae/test/0db/$x || exit 1 + echo "generating cmvn for test data $x" + steps/compute_cmvn_stats.sh data/dae/test/0db/$x exp/dae/cmvn \ + fbank/dae/test/0db/$x || exit 1 + cp -R data/dae/test/0db/$x data/dae/test/0db/${x}_phone && cp data/test/phone.txt data/dae/test/0db/${x}_phone/text || exit 1 + done +fi + +#DAE training +if [ $stage -le 1 ]; then + #train dnn dae using data with mixed noise + #produce merged feats.scp as --labels for both training and cv + dir=exp/tri4b_dnn_dae && mkdir -p exp/tri4b_dnn_dae || exit 1 + cat data/fbank/train/feats.scp data/fbank/dev/feats.scp | sort -u > $dir/tgt_feats.scp + cat data/fbank/train/cmvn.scp data/fbank/dev/cmvn.scp | sort -u > $dir/tgt_cmvn.scp + + num_fea=$(feat-to-dim scp:$dir/tgt_feats.scp -) + echo "num_fea = $num_fea" + + $cuda_cmd exp/tri4b_dnn_dae/log/train_nnet.log \ + steps/nnet/train.sh --hid-layers 2 --hid-dim 1200 \ + --cmvn-opts "--norm-vars=false" --splice 10 \ + --learn-rate 0.0001 \ + --train_tool_opts "--objective-function=mse" \ + --copy_feats false \ + --labels "ark:copy-feats scp:$dir/tgt_feats.scp ark:- | apply-cmvn --norm-vars=false scp:$dir/tgt_cmvn.scp ark:- ark:- | feat-to-post ark:- ark:-|" \ + --num-tgt $num_fea \ + --proto-opts '--no-softmax ' \ + data/dae/train data/dae/dev/0db data/lang \ + data/fbank/train data/fbank/dev \ + exp/tri4b_dnn_dae || exit 1; + nnet-concat exp/tri4b_dnn_dae/final.feature_transform exp/tri4b_dnn_dae/final.nnet \ + exp/tri4b_dnn_mpe/final.feature_transform exp/tri4b_dnn_dae/dae.nnet || exit 1 + +fi + +#decoding +if [ $stage -le 2 ]; then + for x in car white cafe; do + ( + #decode word + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe \ + exp/tri4b/graph_word data/dae/test/0db/$x exp/tri4b_dnn_mpe/decode_word_0db/$x || exit 1; + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe --feature-transform exp/tri4b_dnn_dae/dae.nnet \ + exp/tri4b/graph_word data/dae/test/0db/$x exp/tri4b_dnn_dae/decode_word_0db/$x || exit 1; + + #decode phone + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe \ + exp/tri4b/graph_phone data/dae/test/0db/${x}_phone exp/tri4b_dnn_mpe/decode_phone_0db/$x || exit 1; + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe --feature-transform exp/tri4b_dnn_dae/dae.nnet \ + exp/tri4b/graph_phone data/dae/test/0db/${x}_phone exp/tri4b_dnn_dae/decode_phone_0db/$x || exit 1; + ) & + done +fi + diff --git a/egs/thchs30/s5/local/dae/thchs30_util.pyx b/egs/thchs30/s5/local/dae/thchs30_util.pyx new file mode 100755 index 00000000000..281ff166032 --- /dev/null +++ b/egs/thchs30/s5/local/dae/thchs30_util.pyx @@ -0,0 +1,27 @@ +# Copyright 2016 Tsinghua University (Author: Chao Liu). Apache 2.0. 
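add-noise-mod.py tries to import pyximport so that the Cython helper below (thchs30_util.pyx) replaces the slow pure-Python energy/mix loops. A quick way to check that this optional Cython path compiles on your machine, assuming Cython and a C compiler are installed, is to run the following from egs/thchs30/s5/local/dae:

python -c 'import pyximport; pyximport.install(); import thchs30_util; print(thchs30_util.energy([1.0, 2.0, 3.0]))'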
+ +def energy(list mat): + cdef float e + cdef int i, j, l + l = len(mat) + for i in range(l): + j = mat[i] + e += j * j + e /= l + return e + +def mix(list mat, list noise, int pos, double scale): + cdef len_noise, len_mat, i, x, y + ret = [] + len_noise = len(noise) + len_mat = len(mat) + for i in range(len_mat): + x = mat[i] + y = int(x + scale * noise[pos]) + if y > 32767: + y = 32767 + elif y < -32768: + y = -32768 + ret.append(y) + pos = (pos + 1) % len_noise + return pos, ret diff --git a/egs/thchs30/s5/local/download_and_untar.sh b/egs/thchs30/s5/local/download_and_untar.sh new file mode 100755 index 00000000000..655e674dc9b --- /dev/null +++ b/egs/thchs30/s5/local/download_and_untar.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# Copyright 2016 Tsinghua University (author: Dong Wang) +# Apache 2.0 + +# Adapted from librispeech recipe local/download_and_untar.sh + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /nfs/public/materials/data/thchs30-openslr www.openslr.org/resources/18 data_thchs30" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_thchs30, test-noise, resource" +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_thchs30 test-noise resource" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + + +sizes="6453425169 1971460210 24813708" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.tgz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + pwd + echo " wget --no-check-certificate $full_url" + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi diff --git a/egs/thchs30/s5/local/nnet/run_dnn.sh b/egs/thchs30/s5/local/nnet/run_dnn.sh new file mode 100755 index 00000000000..d40f48e3609 --- /dev/null +++ b/egs/thchs30/s5/local/nnet/run_dnn.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#run from ../.. +#DNN training, both xent and MPE + + +. 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +stage=0 +nj=8 + +. utils/parse_options.sh || exit 1; + +gmmdir=$1 +alidir=$2 +alidir_cv=$3 + +#generate fbanks +if [ $stage -le 0 ]; then + echo "DNN training: stage 0: feature generation" + rm -rf data/fbank && mkdir -p data/fbank && cp -R data/{train,dev,test,test_phone} data/fbank || exit 1; + for x in train dev test; do + echo "producing fbank for $x" + #fbank generation + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" data/fbank/$x exp/make_fbank/$x fbank/$x || exit 1 + #ompute cmvn + steps/compute_cmvn_stats.sh data/fbank/$x exp/fbank_cmvn/$x fbank/$x || exit 1 + done + + echo "producing test_fbank_phone" + cp data/fbank/test/feats.scp data/fbank/test_phone && cp data/fbank/test/cmvn.scp data/fbank/test_phone || exit 1; + +fi + + +#xEnt training +if [ $stage -le 1 ]; then + outdir=exp/tri4b_dnn + #NN training + (tail --pid=$$ -F $outdir/log/train_nnet.log 2>/dev/null)& # forward log + $cuda_cmd $outdir/log/train_nnet.log \ + steps/nnet/train.sh --copy_feats false --cmvn-opts "--norm-means=true --norm-vars=false" --hid-layers 4 --hid-dim 1024 \ + --learn-rate 0.008 data/fbank/train data/fbank/dev data/lang $alidir $alidir_cv $outdir || exit 1; + #Decode (reuse HCLG graph in gmmdir) + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --srcdir $outdir --config conf/decode_dnn.config --acwt 0.1 \ + $gmmdir/graph_word data/fbank/test $outdir/decode_test_word || exit 1; + )& + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --srcdir $outdir --config conf/decode_dnn.config --acwt 0.1 \ + $gmmdir/graph_phone data/fbank/test_phone $outdir/decode_test_phone || exit 1; + )& + +fi + +#MPE training + +srcdir=exp/tri4b_dnn +acwt=0.1 + +if [ $stage -le 2 ]; then + # generate lattices and alignments + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + data/fbank/train data/lang $srcdir ${srcdir}_ali || exit 1; + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + data/fbank/train data/lang $srcdir ${srcdir}_denlats || exit 1; +fi + +if [ $stage -le 3 ]; then + outdir=exp/tri4b_dnn_mpe + #Re-train the DNN by 3 iteration of MPE + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 3 --acwt $acwt --do-smbr false \ + data/fbank/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $outdir || exit 1 + #Decode (reuse HCLG graph) + for ITER in 3 2 1; do + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --nnet $outdir/${ITER}.nnet --config conf/decode_dnn.config --acwt $acwt \ + $gmmdir/graph_word data/fbank/test $outdir/decode_test_word_it${ITER} || exit 1; + )& + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --nnet $outdir/${ITER}.nnet --config conf/decode_dnn.config --acwt $acwt \ + $gmmdir/graph_phone data/fbank/test_phone $outdir/decode_test_phone_it${ITER} || exit 1; + )& + done +fi + diff --git a/egs/thchs30/s5/local/score.sh b/egs/thchs30/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/thchs30/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/thchs30/s5/local/thchs-30_data_prep.sh b/egs/thchs30/s5/local/thchs-30_data_prep.sh new file mode 100755 index 00000000000..7a85274ce83 --- /dev/null +++ b/egs/thchs30/s5/local/thchs-30_data_prep.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). 
Apache 2.0. + +#This script pepares the data directory for thchs30 recipe. +#It reads the corpus and get wav.scp and transcriptions. + +dir=$1 +corpus_dir=$2 + + +cd $dir + +echo "creating data/{train,dev,test}" +mkdir -p data/{train,dev,test} + +#create wav.scp, utt2spk.scp, spk2utt.scp, text +( +for x in train dev test; do + echo "cleaning data/$x" + cd $dir/data/$x + rm -rf wav.scp utt2spk spk2utt word.txt phone.txt text + echo "preparing scps and text in data/$x" + for nn in `find $corpus_dir/$x/*.wav | sort -u | xargs -i basename {} .wav`; do + echo $nn $corpus_dir/$x/$nn.wav >> wav.scp + echo $nn $nn >> utt2spk + echo $nn $nn >> spk2utt + echo $nn `sed -n 1p $corpus_dir/data/$nn.wav.trn` >> word.txt + echo $nn `sed -n 3p $corpus_dir/data/$nn.wav.trn` >> phone.txt + done + cp word.txt text +done +) || exit 1 + +echo "creating test_phone for phone decoding" +( + rm -rf data/test_phone && cp -R data/test data/test_phone || exit 1 + cd data/test_phone && rm text && cp phone.txt text || exit 1 +) + diff --git a/egs/thchs30/s5/local/thchs-30_decode.sh b/egs/thchs30/s5/local/thchs-30_decode.sh new file mode 100755 index 00000000000..f9661f61f21 --- /dev/null +++ b/egs/thchs30/s5/local/thchs-30_decode.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#decoding wrapper for thchs30 recipe +#run from ../ + +nj=8 +mono=false + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +. utils/parse_options.sh || exit 1; +decoder=$1 +srcdir=$2 +datadir=$3 + + +if [ $mono = true ];then + echo "using monophone to generate graph" + opt="--mono" +fi + +#decode word +utils/mkgraph.sh $opt data/graph/lang $srcdir $srcdir/graph_word || exit 1; +$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_word $datadir/test $srcdir/decode_test_word || exit 1 + +#decode phone +utils/mkgraph.sh $opt data/graph_phone/lang $srcdir $srcdir/graph_phone || exit 1; +$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_phone $datadir/test_phone $srcdir/decode_test_phone || exit 1 + + diff --git a/egs/thchs30/s5/local/wer_output_filter b/egs/thchs30/s5/local/wer_output_filter new file mode 100755 index 00000000000..1ccb651a258 --- /dev/null +++ b/egs/thchs30/s5/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python +#Copyright 2016 Tsinghua University (Author: Dong Wang). Apache 2.0. + +#This script accepts a Chinese stream and inserts blanks between Chinese characters +#Used to prepare character-based transcriptions and compute CER. + +from __future__ import print_function +import sys + +for l in sys.stdin: + l=l.strip() + ll=l.split() + lk=ll[0] + for v in ll[1:]: + v = v.decode('utf-8') + for i in v: + lk= lk + ' ' + i + + print (lk.encode('utf-8')) diff --git a/egs/thchs30/s5/path.sh b/egs/thchs30/s5/path.sh new file mode 100755 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/thchs30/s5/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/thchs30/s5/run.sh b/egs/thchs30/s5/run.sh new file mode 100755 index 00000000000..24645f59e83 --- /dev/null +++ b/egs/thchs30/s5/run.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh + +H=`pwd` #exp home +n=8 #parallel jobs + +#corpus and trans directory +thchs=/nfs/public/materials/data/thchs30-openslr + +#you can obtain the database by uncommting the following lines +#[ -d $thchs ] || mkdir -p $thchs || exit 1 +#echo "downloading THCHS30 at $thchs ..." +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 data_thchs30 || exit 1 +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 resource || exit 1 +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 test-noise || exit 1 + +#data preparation +#generate text, wav.scp, utt2pk, spk2utt +local/thchs-30_data_prep.sh $H $thchs/data_thchs30 || exit 1; + +#produce MFCC features +rm -rf data/mfcc && mkdir -p data/mfcc && cp -R data/{train,dev,test,test_phone} data/mfcc || exit 1; +for x in train dev test; do + #make mfcc + steps/make_mfcc.sh --nj $n --cmd "$train_cmd" data/mfcc/$x exp/make_mfcc/$x mfcc/$x || exit 1; + #compute cmvn + steps/compute_cmvn_stats.sh data/mfcc/$x exp/mfcc_cmvn/$x mfcc/$x || exit 1; +done +#copy feats and cmvn to test.ph, avoid duplicated mfcc & cmvn +cp data/mfcc/test/feats.scp data/mfcc/test_phone && cp data/mfcc/test/cmvn.scp data/mfcc/test_phone || exit 1; + + +#prepare language stuff +#build a large lexicon that invovles words in both the training and decoding. +( + echo "make word graph ..." + cd $H; mkdir -p data/{dict,lang,graph} && \ + cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict && \ + cat $thchs/resource/dict/lexicon.txt $thchs/data_thchs30/lm_word/lexicon.txt | \ + grep -v '' | grep -v '' | sort -u > data/dict/lexicon.txt || exit 1; + utils/prepare_lang.sh --position_dependent_phones false data/dict "" data/local/lang data/lang || exit 1; + gzip -c $thchs/data_thchs30/lm_word/word.3gram.lm > data/graph/word.3gram.lm.gz || exit 1; + utils/format_lm.sh data/lang data/graph/word.3gram.lm.gz $thchs/data_thchs30/lm_word/lexicon.txt data/graph/lang || exit 1; +) + +#make_phone_graph +( + echo "make phone graph ..." 
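Going back to the commented-out download lines near the top of this run.sh: they use the download_and_untar.sh helper added earlier in this patch, and all three corpus parts can be fetched in one loop. A sketch, assuming $thchs points at an existing directory where you want the data (the helper refuses to run if the directory does not exist):

thchs=/nfs/public/materials/data/thchs30-openslr   # or wherever you keep corpora
mkdir -p $thchs
for part in data_thchs30 resource test-noise; do
  local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 $part || exit 1
done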
+ cd $H; mkdir -p data/{dict_phone,graph_phone,lang_phone} && \ + cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict_phone && \ + cat $thchs/data_thchs30/lm_phone/lexicon.txt | grep -v '' | sort -u > data/dict_phone/lexicon.txt && \ + echo " sil " >> data/dict_phone/lexicon.txt || exit 1; + utils/prepare_lang.sh --position_dependent_phones false data/dict_phone "" data/local/lang_phone data/lang_phone || exit 1; + gzip -c $thchs/data_thchs30/lm_phone/phone.3gram.lm > data/graph_phone/phone.3gram.lm.gz || exit 1; + utils/format_lm.sh data/lang_phone data/graph_phone/phone.3gram.lm.gz $thchs/data_thchs30/lm_phone/lexicon.txt \ + data/graph_phone/lang || exit 1; +) + +#monophone +steps/train_mono.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono || exit 1; +#test monophone model +local/thchs-30_decode.sh --mono true --nj $n "steps/decode.sh" exp/mono data/mfcc & + +#monophone_ali +steps/align_si.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono exp/mono_ali || exit 1; + +#triphone +steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 data/mfcc/train data/lang exp/mono_ali exp/tri1 || exit 1; +#test tri1 model +local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri1 data/mfcc & + +#triphone_ali +steps/align_si.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +#lda_mllt +steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 2500 15000 data/mfcc/train data/lang exp/tri1_ali exp/tri2b || exit 1; +#test tri2b model +local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri2b data/mfcc & + + +#lda_mllt_ali +steps/align_si.sh --nj $n --cmd "$train_cmd" --use-graphs true data/mfcc/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + +#sat +steps/train_sat.sh --cmd "$train_cmd" 2500 15000 data/mfcc/train data/lang exp/tri2b_ali exp/tri3b || exit 1; +#test tri3b model +local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri3b data/mfcc & + +#sat_ali +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri3b exp/tri3b_ali || exit 1; + +#quick +steps/train_quick.sh --cmd "$train_cmd" 4200 40000 data/mfcc/train data/lang exp/tri3b_ali exp/tri4b || exit 1; +#test tri4b model +local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri4b data/mfcc & + +#quick_ali +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri4b exp/tri4b_ali || exit 1; + +#quick_ali_cv +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/dev data/lang exp/tri4b exp/tri4b_ali_cv || exit 1; + +#train dnn model +local/nnet/run_dnn.sh --stage 0 --nj $n exp/tri4b exp/tri4b_ali exp/tri4b_ali_cv || exit 1; + +#train dae model +#python2.6 or above is required for noisy data generation. +#To speed up the process, pyximport for python is recommeded. 
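run_dae.sh (shown earlier in this patch) reads --stage and --nj through utils/parse_options.sh, so once the noisy training and dev data exist you can resume directly from DAE training and decoding instead of regenerating everything, for example:

local/dae/run_dae.sh --stage 1 --nj $n $thchs || exit 1   # stage 0 (noisy-data generation) is skipped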
+local/dae/run_dae.sh --stage 0 $thchs || exit 1; diff --git a/egs/thchs30/s5/steps b/egs/thchs30/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/thchs30/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/thchs30/s5/utils b/egs/thchs30/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/thchs30/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/tidigits/s5/cmd.sh b/egs/tidigits/s5/cmd.sh index c8f0d9d67a7..71dd849a93b 100644 --- a/egs/tidigits/s5/cmd.sh +++ b/egs/tidigits/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -#export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/tidigits/s5/local/tidigits_prepare_lang.sh b/egs/tidigits/s5/local/tidigits_prepare_lang.sh index ff316514fc9..0bc08ab40a0 100755 --- a/egs/tidigits/s5/local/tidigits_prepare_lang.sh +++ b/egs/tidigits/s5/local/tidigits_prepare_lang.sh @@ -88,10 +88,11 @@ utils/make_lexicon_fst.pl $tmpdir/lexicon.txt 0.5 sil | \ cp $lang/L.fst $lang/L_disambig.fst -silphonelist=`cat $lang/phones/silence.csl | sed 's/:/ /g'` -nonsilphonelist=`cat $lang/phones/nonsilence.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo +num_sil_states=5 +num_nonsil_states=3 +silphonelist=`cat $lang/phones/silence.csl` +nonsilphonelist=`cat $lang/phones/nonsilence.csl` +utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$lang/topo # Now we prepare a simple grammar G.fst that's a kind of loop of # digits (no silence in this, since that's handled in L.fst) diff --git a/egs/tidigits/s5/path.sh b/egs/tidigits/s5/path.sh index 3ee46078956..2d17b17a84a 100755 --- a/egs/tidigits/s5/path.sh +++ b/egs/tidigits/s5/path.sh @@ -1,3 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/timit/README.txt b/egs/timit/README.txt index 7e5bfa8e82d..f8ca39c4fc9 100644 --- a/egs/timit/README.txt +++ b/egs/timit/README.txt @@ -21,15 +21,14 @@ About TIMIT: Each subdirectory of this directory contains the scripts for a sequence of experiments. - s3: Monophone GMM/HMM system trained with Maximum likelihood. Training - is done with 61 phonemes, that are collapsed down to 39 phoneme - during testing. Implemented by Navdeep Jaitly (ndjaitly@cs.toronto.edu) - [from Dan: I believe this is now somewhat out of date, please us s5/] - - s4: Monophone, Triphone GMM/HMM systems trained with Maximum Likelihood. - Training is done on 48 phonemes (see- Lee and Hon: Speaker-Independent + s5: Monophone, Triphone GMM/HMM systems trained with Maximum Likelihood, + followed by SGMM and DNN recipe. + Training is done on 48 phonemes (see- Lee and Hon: Speaker-Independent Phone Recognition Using Hidden Markov Models. IEEE TRANSACTIONS ON ACOUSTICS. SPEECH, AND SIGNAL PROCESSING, VOL. 31. NO. 11, PG. 1641-48, - NOVEMBER 1989, ). Implemented by Arnab Ghoshal (arnab13@gmail.com) + NOVEMBER 1989, ). In scoring we map to 39 phonememes, as is usually + done in conference papers. + The earlier versions of TIMIT scripts were implemented by Navdeep Jaitly, + Arnab Ghoshal. Current version was developed by Bagher BabaAli and is + maintained by Karel Vesely (vesis84@gmail.com). - s5: the currently recommended recipe. diff --git a/egs/timit/s3/RESULTS b/egs/timit/s3/RESULTS deleted file mode 100644 index aeb53d8a5c2..00000000000 --- a/egs/timit/s3/RESULTS +++ /dev/null @@ -1,11 +0,0 @@ -# dev set -#compute-wer --mode=present ark:- ark,p:tmp -#%WER 34.42 [ 5003 / 14534, 218 ins, 1974 del, 2811 sub ] -#%SER 100.00 [ 400 / 400 ] -#Scored 400 sentences, 0 not present in hyp. -# test set -#compute-wer --mode=present ark:- ark,p:tmp -#%WER 35.67 [ 2479 / 6949, 98 ins, 1009 del, 1372 sub ] -#%SER 100.00 [ 192 / 192 ] - - diff --git a/egs/timit/s3/conf/plp.conf b/egs/timit/s3/conf/plp.conf deleted file mode 100644 index c4b73674cab..00000000000 --- a/egs/timit/s3/conf/plp.conf +++ /dev/null @@ -1,2 +0,0 @@ -# No non-default options for now. - diff --git a/egs/timit/s3/conf/topo.proto b/egs/timit/s3/conf/topo.proto deleted file mode 100644 index 14a6da73983..00000000000 --- a/egs/timit/s3/conf/topo.proto +++ /dev/null @@ -1,22 +0,0 @@ - - - -NONSILENCEPHONES - - 0 0 0 0.75 1 0.25 - 1 1 1 0.75 2 0.25 - 2 2 2 0.75 3 0.25 - 3 - - - -SILENCEPHONES - - 0 0 0 0.25 1 0.25 2 0.25 3 0.25 - 1 1 1 0.25 2 0.25 3 0.25 4 0.25 - 2 2 1 0.25 2 0.25 3 0.25 4 0.25 - 3 3 1 0.25 2 0.25 3 0.25 4 0.25 - 4 4 4 0.25 5 0.75 - 5 - - diff --git a/egs/timit/s3/local/create_biphone_lm.sh b/egs/timit/s3/local/create_biphone_lm.sh deleted file mode 100755 index 2c4c84dba2e..00000000000 --- a/egs/timit/s3/local/create_biphone_lm.sh +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2012 Navdeep Jaitly - -# Is mostly a cut and paste operation, derived from -# ../../../tools/kaldi_lm/train_lm.sh to create an lm for -# biphone/bigram language models, which train_lm.sh does not -# deign to do. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz -# Expects train.gz, word_map in [argument 1 folder]. -# Call from local/timit_train_lms.sh. - -if [ $# != 1 ]; then - echo "Usage: ../../local/create_biphone_lm.sh [lm folder]" - echo "eg: ../../local/create_biphone_lm.sh data/local" - exit 1; -fi - - -export PATH=$PATH:`pwd`/../../../tools/kaldi_lm -dir=$1 - -requirements="$dir/train.gz $dir/word_map" -for f in $requirements; do - if [ ! -f $f ]; then - echo "create_biphone_lm.sh: no such file $f" - exit 1; - fi -done - -echo "Training biphone language model in folder $dir" -subdir=$dir/biphone -echo "Creating directory $subdir" -mkdir -p $subdir - -# Clearly we don't have enough data to build a properly cross validated back-off model. -# In addition there is no need for a backoff model since we have all bigrams in the -# training data. However, taking out some of the data for validation set may remove -# some of the bigrams. This may seem like a bad thing, but could be a good thing if -# the resulting smoothing helps. - -heldout_sent=300 -write_arpa=1 - -if [ -s $subdir/ngrams.gz -a -s $subdir/heldout_ngrams.gz ]; then - echo "Not creating raw N-gram counts ngrams.gz and heldout_ngrams.gz since they already exist in $subdir" - echo "(remove them if you want them regenerated)" -else - echo Getting raw N-gram counts - - gunzip -c $dir/train.gz | tail -n +$heldout_sent | get_raw_ngrams 2 | sort | uniq -c |\ - uniq_to_ngrams | sort | gzip -c > $subdir/ngrams.gz - # Note: the Perl command below adds ":" before the count, which - # is a marker that these N-grams are test N-grams. - gunzip -c $dir/train.gz | head -n $heldout_sent | \ - get_raw_ngrams 2 | sort | uniq -c | uniq_to_ngrams | \ - perl -ane 's/(\S+)$/:$1/; print;' | sort | gzip -c > $subdir/heldout_ngrams.gz -fi - -cat > $subdir/config.0 < $subdir/config.diff_1 < $subdir/config.diff_2 < $subdir/config.diff_3 < $subdir/config.diff_4 < $subdir/config.diff_5 < $subdir/config.diff_6 < $subdir/config.diff_7 < $dir/wordlist.mapped - -# Define a subroutine -get_perplexity() { # echoes the perplexity to stdout. uses current "$config" as config - time gunzip -c $subdir/ngrams.gz | \ - discount_ngrams "$config" | sort | merge_ngrams | \ - interpolate_ngrams $dir/wordlist.mapped 0.5 | sort | \ - sort -m <(gunzip -c $subdir/heldout_ngrams.gz) - | compute_perplexity -} - -mkdir -p $subdir/configs/ $subdir/perplexities/ - -if [ -f $subdir/config.$num_configs ]; then - echo Not doing optimization of discounting parameters since - echo file $subdir/config.$num_configs already exists -else - for n in `seq 1 $num_configs`; do - echo "Iteration $n/$num_configs of optimizing discounting parameters" - for alpha in -0.25 0.0 0.35; do - config=$subdir/configs/config.$n.$alpha - # Note: if this ensure-nonnegative stuff gets active here it would cause - # the optimization to give the wrong answer, but we've set up the config files - # in such a way that this shouldn't happen. 
- scale_configs.pl $subdir/config.$[$n-1] $subdir/config.diff_$n $alpha > $config - get_perplexity > $subdir/perplexities/$n.$alpha & - done - wait - optimize_alpha.pl -0.25 `cat $subdir/perplexities/$n.-0.25` \ - 0.0 `cat $subdir/perplexities/$n.0.0` \ - 0.35 `cat $subdir/perplexities/$n.0.35` > $subdir/perplexities/alpha.$n || exit 1; - alpha=`cat $subdir/perplexities/alpha.$n` - echo "Alpha value on iter $n is $alpha" - scale_configs.pl $subdir/config.$[$n-1] $subdir/config.diff_$n $alpha > $subdir/config.$n - done -fi -echo Final config is: -cat $subdir/config.$num_configs - -# Create final LM as discounted (but not interpolated) N-grams: -if gunzip -c $subdir/ngrams_disc.gz >&/dev/null; then - echo "Not creating discounted N-grams file $subdir/ngrams_disc.gz since it already exists" -else - echo "Discounting N-grams." - gunzip -c $subdir/ngrams.gz | \ - discount_ngrams $subdir/config.$num_configs | sort | merge_ngrams | \ - gzip -c > $subdir/ngrams_disc.gz -fi - -echo "Computing final perplexity" -gunzip -c $subdir/ngrams_disc.gz | \ - interpolate_ngrams $dir/wordlist.mapped 0.5 | \ - sort | sort -m <(gunzip -c $subdir/heldout_ngrams.gz) - | \ - compute_perplexity 2>&1 | tee $subdir/perplexity & - - -if [ $write_arpa == 1 ]; then - echo "Building ARPA LM (perplexity computation is in background)" - mkdir -p $subdir/tmpdir - gunzip -c $subdir/ngrams_disc.gz | \ - interpolate_ngrams --arpa $dir/wordlist.mapped 0.5 | \ - sort | finalize_arpa.pl $subdir/tmpdir | \ - map_words_in_arpa.pl $dir/word_map | \ - gzip -c > $subdir/lm_unpruned.gz -fi - diff --git a/egs/timit/s3/local/export_log_fbanks_to_htk.sh b/egs/timit/s3/local/export_log_fbanks_to_htk.sh deleted file mode 100755 index 208f7d2a037..00000000000 --- a/egs/timit/s3/local/export_log_fbanks_to_htk.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -# Copyright 2012 Navdeep Jaitly - -# This program allows you to export log filterbank data from -# KALDI to HTK format. Also exported is the force alignment -# data, from the gmm alignment. -# HTK files are created, one per input file. -# alignment file: ali is create one for the entire set (test/dev/train). -# Can be used for offline neural network training if you don't use -# the abilities of Kaldi to do so. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - - -config=conf/mfcc.conf -data=data -#out_path=/ais/gobi2/ndjaitly/Data/Kaldi/Spectrograms/ -#out_path=/ais/gobi2/ndjaitly/Data/Kaldi/FBANKS/ -out_path=/ais/gobi2/ndjaitly/Data/Kaldi/export/FBANKS_25_10/ -num_mel_bins=40 -power_spectrum_only=0 -frame_length=25 -frame_shift=10 - -#for test in train test dev ; do -for test in test dev ; do - scp=$data/$test/wav.scp - out_dir=$out_path/$test/ - out_scp=$out_path/$test/htk.scp - out_ali=$out_path/$test/ali - mkdir -p $out_dir - cat $scp | awk -v outdir=$out_dir '{ printf $1 " " outdir $1 ".htk\n"; }' > $out_scp - compute-fbank-feats --frame-length=$frame_length --frame-shift=$frame_shift \ - --num-mel-bins=$num_mel_bins --output-format=htk --verbose=2 \ - --config=$config scp:$scp scp:$out_scp - ali-to-pdf exp/mono/final.mdl ark:exp/mono_ali_$test/ali t,ark:- > $out_ali -done diff --git a/egs/timit/s3/local/get_word_map.pl b/egs/timit/s3/local/get_word_map.pl deleted file mode 100755 index fe90ba68a06..00000000000 --- a/egs/timit/s3/local/get_word_map.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env perl -# A very small modification on ../../../tools/kaldi_lm/get_word_map.pl to account -# for no OOV vocab terms in timit. - Navdeep Jaitly. - - -# This program reads in a file with one word -# on each line, and outputs a "translation file" of the form: -# word short-form-of-word -# on each line, -# where short-form-of-word is a kind of abbreviation of the word. -# -# It uses the letters a-z and A-Z, plus the characters from -# 128 to 255. The first words in the file have the shortest representation. -# -# For convenience, it makes sure to give , -# a consistent labeling, as A and B respectively. - - -# set up character table and some variables. -@C = (); -foreach $x (ord('A')...ord('Z')) { push @C, chr($x); } -foreach $x (ord('a')...ord('z')) { push @C, chr($x); } -foreach $x(128...254) { push @C, chr($x); } # 255 is space so don't include it. - -@index = ( 2 ); # array of indexes into @C... count up to [dim of C -1] - # then add another index onto this. Set it to 3, since 0 and 1 are - # reserved for and respectively. - -if (@ARGV != 2 && @ARGV != 3) { - die "Usage: get_word_map.pl bos-symbol eos-symbol [words-in-order]\n"; -} - -$bos = shift @ARGV; -$eos = shift @ARGV; -print "$bos $C[0]\n"; -print "$eos $C[1]\n"; - -sub get_short_form(); - -while(<>) { - chop; - $word = $_; - $word =~ m:^\S+$: || die "Bad word $word"; - if($seen{$word}) { die "Word $word repeated"; } - $seen{$word}++; - if ($word ne $bos && $word ne $eos) { - $short_form = get_short_form(); - print "$word $short_form\n"; - } -} - -sub get_short_form() { - $ans = ""; - foreach $i (@index) { $ans = $C[$i] . $ans; } # - # Now increment the index. - $index[0]++; - $cur_idx = 0; - while ($index[$cur_idx] == @C) { # E.g. if the least significant digit - # is out of the valid range... carry one. - $index[$cur_idx] = 0; - $cur_idx++; - $index[$cur_idx]++; # This will extend the array if necessary. - } - return $ans; -} diff --git a/egs/timit/s3/local/make_trans.pl b/egs/timit/s3/local/make_trans.pl deleted file mode 100755 index 230b4fab2bf..00000000000 --- a/egs/timit/s3/local/make_trans.pl +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2012 Navdeep Jaitly. -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# usage: make_trans.sh prefix in.flist out.txt out.scp - -# prefix is first letters of the database "key" (rest are numeric) - -# in.flist is just a list of the WAV file paths (X.WAV). The -# monophone transcriptions are in the files (X.phn). -# out.txt is the output transcriptions in format "key word1 word\n" -# out.scp is the output scp file, which is as in.scp but has the -# database-key first on each line. - -# Reads from first argument in.flist -# Writes to standard output trans.txt - -sub ParseTranscript() { - my $transcript_file = $_[0]; - open(F, "<$transcript_file") || die "Error opening phone transcription file $transcript_file\n"; - my $trans = "h#" ; - my $line = ; - chomp ($line); - # first line should be "h#". - ($line =~/h#/) || die "First line should be h#. Got line: $line"; - my @pieces; - while() { - chomp ; - @pieces = split(" ", $_); - @pieces == 3 || die "Error parsing file: $transcript_file, line: $_. Expected 3 fields. Found @pieces"; - $trans = $trans . " " . $pieces[2]; - } - ($pieces[2] =~/^h#/) || die "Last line should be h#"; - #$trans =~s/^h#// ; # first h# - #$trans =~s/h#/<\\s>/ ; # last h# - $trans =~s/^h#// ; # first h# - $trans =~s/h#$// ; # last h# - ($trans !~ m/h#/) || die "Found h# character in transcript, other than start or end."; - - close(F); - return $trans ; -} - -if(@ARGV != 4) { - die "usage: make_trans.sh prefix in.flist out.txt out.scp\n"; -} -($prefix, $in_flist, $out_txt, $out_scp) = @ARGV; - -open(G, "<$in_flist") || die "Opening file list $in_flist"; - -open(O, ">$out_txt") || die "Open output transcription file $out_txt"; - -open(P, ">$out_scp") || die "Open output scp file $out_scp"; - -while() { - my $sph_file = $_ ; - chomp ($sph_file) ; - $_ =~ m:/(\w+)/(\w+)\.WAV\s+$:i || die "bad scp line $_"; - $spkname = $1; - $uttname = $2; - $uttname =~ tr/a-z/A-Z/; - $spkname =~ s/_//g; # remove underscore from spk name to make key nicer. - $key = $prefix . "_" . $spkname . "_" . $uttname; - $key =~ tr/A-Z/a-z/; # Make it all lower case. - # to make the numerical and string-sorted orders the same. - my $transcript_file = substr($_, 0, length($_)-4) . "phn"; - if (! -e $transcript_file ) { - $transcript_file = substr($_, 0, length($_)-4) . "PHN"; - } - if (! -e $transcript_file ) { - print "Transcription file: $transcript_file missing." ; - } - - my $trans = &ParseTranscript($transcript_file); - $trans =~ tr/a-z/A-Z/; # Make it all upper case. - print P "$key $sph_file\n"; - print O "$key $trans\n"; - $n++; -} -close(O) || die "Closing output."; -close(P) || die "Closing output."; - - diff --git a/egs/timit/s3/local/timit_data_prep.sh b/egs/timit/s3/local/timit_data_prep.sh deleted file mode 100755 index ff6754b4a55..00000000000 --- a/egs/timit/s3/local/timit_data_prep.sh +++ /dev/null @@ -1,140 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Navdeep Jaitly -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from one directory above this script. - -# The input is the 3 CDs from the LDC distribution of Resource Management. -# The script's argument is a directory which has three subdirectories: -# rm1_audio1 rm1_audio2 rm2_audio - -# Note: when creating your own data preparation scripts, it's a good idea -# to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the -# transcription file is exactly the same length as the scp file and is also -# sorted on utterance id (missing transcriptions should be removed from the -# scp file using e.g. scripts/filter_scp.pl) - -if [ $# != 1 ]; then - echo "Usage: ../../local/timit_data_prep.sh /path/to/TIMIT" - exit 1; -fi - -TIMIT_ROOT=$1 -S3_ROOT=`pwd` -mkdir -p data/local -cd data/local - -lower_case=0 -upper_case=0 -if [ -d $TIMIT_ROOT/TIMIT/TRAIN -a -d $TIMIT_ROOT/TIMIT/TEST ]; - then - upper_case=1 - train_folder=$TIMIT_ROOT/TIMIT/TRAIN - test_folder=$TIMIT_ROOT/TIMIT/TEST - spkr_info_file=$TIMIT_ROOT/TIMIT/DOC/SPKRINFO.TXT -elif [ -d $TIMIT_ROOT/timit/train -a -d $TIMIT_ROOT/timit/test ]; - then - lower_case=1 - train_folder=$TIMIT_ROOT/timit/train - test_folder=$TIMIT_ROOT/timit/test - spkr_info_file=$TIMIT_ROOT/timit/doc/spkrinfo.txt -else - echo "Error: run.sh requires a directory argument (an absolute pathname) that contains TIMIT/TRAIN and TIMIT/TEST or timit/train and timit/test." - exit 1; -fi - - -( - find $train_folder -iname "*.wav" | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' -) > train_sph.flist - - -# make_trans.pl also creates the utterance id's and the kaldi-format scp file. -$S3_ROOT/local/make_trans.pl trn train_sph.flist train_trans.txt train_sph.scp || exit 1; -mv train_trans.txt tmp; sort -k 1 tmp > train_trans.txt -mv train_sph.scp tmp; sort -k 1 tmp > train_sph.scp -rm tmp - -sph2pipe=`cd $S3_ROOT ; cd ../../..; echo $PWD/tools/sph2pipe_v2.5/sph2pipe` -if [ ! -f $sph2pipe ]; then - echo "Could not find the sph2pipe program at $sph2pipe"; - exit 1; -fi -awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < train_sph.scp > train_wav.scp - -cat train_wav.scp | perl -ane 'm/^(\w+_(\w+)\w_\w+) / || die; print "$1 $2\n"' > train.utt2spk -cat train.utt2spk | sort -k 2 | $S3_ROOT/scripts/utt2spk_to_spk2utt.pl > train.spk2utt - -echo "Creating coretest set." 
-test_speakers="mdab0 mwbt0 felc0 mtas1 mwew0 fpas0 mjmp0 mlnt0 fpkt0 mlll0 mtls0 fjlm0 mbpm0 mklt0 fnlp0 mcmj0 mjdh0 fmgd0 mgrt0 mnjm0 fdhc0 mjln0 mpam0 fmld0" -dev_speakers="faks0 fdac1 fjem0 mgwt0 mjar0 mmdb1 mmdm2 mpdf0 fcmh0 fkms0 mbdg0 mbwm0 mcsh0 fadg0" -dev_speakers="${dev_speakers} fdms0 fedw0 mgjf0 mglb0 mrtk0 mtaa0 mtdt0 mthc0 mwjg0 fnmr0 frew0 fsem0 mbns0 mmjr0 mdls0 mdlf0" -dev_speakers="${dev_speakers} mdvc0 mers0 fmah0 fdrw0 mrcs0 mrjm4 fcal1 mmwh0 fjsj0 majc0 mjsw0 mreb0 fgjd0 fjmg0 mroa0 mteb0 mjfc0 mrjr0 fmml0 mrws1" - - -if [ $upper_case == 1 ] ; then - test_speakers=`echo $test_speakers | tr '[:lower:]' '[:upper:]'` - dev_speakers=`echo $dev_speakers | tr '[:lower:]' '[:upper:]'` -fi - -rm -f test_sph.flist -for speaker in $test_speakers ; do -echo -n $speaker " " -( - find $test_folder/*/${speaker} -iname "*.wav" | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' -) >> test_sph.flist -done -echo "" -num_lines=`wc -l test_sph.flist | awk '{print $1}'` -echo "# of utterances in coretest set = ${num_lines}" - -echo "Creating dev set." -rm -f dev_sph.flist -for speaker in $dev_speakers ; do -echo -n $speaker " " -( - find $test_folder/*/${speaker} -iname "*.wav" | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' -) >> dev_sph.flist -done -echo "" -num_lines=`wc -l dev_sph.flist | awk '{print $1}'` -echo "# of utterances in dev set = ${num_lines}" - - -# make_trans.pl also creates the utterance id's and the kaldi-format scp file. -for test in test dev ; do - echo "Finalizing ${test}" - $S3_ROOT/local/make_trans.pl ${test} ${test}_sph.flist ${test}_trans.txt ${test}_sph.scp || exit 1; - mv ${test}_trans.txt tmp; sort -k 1 tmp > ${test}_trans.txt - mv ${test}_sph.scp tmp; sort -k 1 tmp > ${test}_sph.scp - rm tmp; - awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${test}_sph.scp > ${test}_wav.scp - - cat ${test}_wav.scp | perl -ane 'm/^(\w+_(\w+)\w_\w+) / || die; print "$1 $2\n"' > ${test}.utt2spk - cat ${test}.utt2spk | sort -k 2 | $S3_ROOT/scripts/utt2spk_to_spk2utt.pl > ${test}.spk2utt -done - - -# Need to set these on the basis of file name first characters. -#grep -v "^;" DOC/SPKRINFO.TXT | awk '{print $1 " " $2 ; } ' | \ -cat $spkr_info_file | \ - perl -ane 'tr/A-Z/a-z/;print;' | grep -v ';' | \ - awk '{print $2$1, $2}' | sort | uniq > spk2gender.map || exit 1; - - -echo timit_data_prep succeeded. diff --git a/egs/timit/s3/local/timit_format_data.sh b/egs/timit/s3/local/timit_format_data.sh deleted file mode 100755 index ba1f5a955f3..00000000000 --- a/egs/timit/s3/local/timit_format_data.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Navdeep Jaitly -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from one directory above this script. - - - -if [ -f path.sh ]; then . 
path.sh; fi - -arpa_lm=data/local/lm/biphone/lm_unpruned.gz - -data_list="train test dev" - -for x in lang lang_test $data_list; do - mkdir -p data/$x -done - -# Copy stuff into its final location: - -for x in $data_list; do - cp data/local/$x.spk2utt data/$x/spk2utt || exit 1; - cp data/local/$x.utt2spk data/$x/utt2spk || exit 1; - cp data/local/${x}_wav.scp data/$x/wav.scp || exit 1; - cp data/local/${x}_trans.txt data/$x/text || exit 1; - scripts/filter_scp.pl data/$x/spk2utt data/local/spk2gender.map > data/$x/spk2gender || exit 1; -done - - -scripts/make_words_symtab.pl < data/local/lexicon.txt > data/lang/words.txt -scripts/make_phones_symtab.pl < data/local/lexicon.txt > data/lang/phones.txt -cp data/lang/words.txt data/lang_test/words.txt - -silphones="sil"; # This would in general be a space-separated list of all silence phones. E.g. "sil vn" -# Generate colon-separated lists of silence and non-silence phones. -scripts/silphones.pl data/lang/phones.txt "$silphones" data/lang/silphones.csl \ - data/lang/nonsilphones.csl - -ndisambig=`scripts/add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt` -ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST. -scripts/add_disambig.pl data/lang/phones.txt $ndisambig > data/lang_test/phones_disambig.txt -cp data/lang_test/phones_disambig.txt data/lang/ # needed for MMI. - -echo "Creating L.fst" -silprob=0.3 # same prob as word -scripts/make_lexicon_fst.pl data/local/lexicon.txt $silprob sil | \ - fstcompile --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > data/lang/L.fst -echo "Done creating L.fst" - - -# L_disambig.fst has the disambiguation symbols (c.f. Mohri's papers) -echo "Creating L_disambig.fst" -scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt $silprob sil '#'$ndisambig | \ - fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \ - --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel \ - > data/lang_test/L_disambig.fst -echo "Done creating L_disambig.fst" - -cp data/lang_test/L_disambig.fst data/lang/ # Needed for MMI training. -echo "Creating G.fst" - -#gunzip -c "$arpa_lm" | \ -# grep -v '<s> <s>' | \ -# grep -v '</s> <s>' | \ -# grep -v '</s> </s>' | \ -# arpa2fst - | fstprint | \ -# scripts/remove_oovs.pl /dev/null | \ -# scripts/eps2disambig.pl | scripts/s2eps.pl | \ -# fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang_test/words.txt --keep_isymbols=false \ -# --keep_osymbols=false > data/lang_test/G.fst -gunzip -c "$arpa_lm" | \ - grep -v '<s> <s>' | \ - grep -v '</s> <s>' | \ - grep -v '</s> </s>' | \ - arpa2fst - | fstprint | \ - scripts/remove_oovs.pl /dev/null | \ - scripts/s2eps.pl | \ - fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang_test/words.txt --keep_isymbols=false \ - --keep_osymbols=false > data/lang_test/G.fst - -echo "G.fst created. How stochastic is it ?" -fstisstochastic data/lang_test/G.fst - -# Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. - -# Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. - -# Checking that disambiguated lexicon times G is determinizable -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstdeterminize >/dev/null || echo Error - -# Checking that LG is stochastic: -echo "How stochastic is LG.fst."
-fstisstochastic data/lang_test/G.fst -fsttablecompose data/lang/L.fst data/lang_test/G.fst | \ - fstisstochastic - -# Checking that LG_disambig.fst is stochastic: -echo "How stochastic is LG_disambig.fst." -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstisstochastic - - -## Check lexicon. -## just have a look and make sure it seems sane. -echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - - -silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'` -nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > data/lang/topo - -for x in phones.txt words.txt silphones.csl nonsilphones.csl topo; do - cp data/lang/$x data/lang_test/$x || exit 1; -done - -echo timit_format_data succeeded. diff --git a/egs/timit/s3/local/timit_train_lms.sh b/egs/timit/s3/local/timit_train_lms.sh deleted file mode 100755 index eb61122442d..00000000000 --- a/egs/timit/s3/local/timit_train_lms.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash -# Copyright 2012 Navdeep Jaitly - -# Derived from swbd/s3/local/swbd_p1_train_lms.sh scripts. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# To be run from one directory above this script. -# This script takes no arguments. It assumes you have already run -# timit_data_prep.sh. -# It takes as input the file -# [argument 1]/train_trans.txt -# and uses it to create the lexicon (just the phones) and the biphone language model. -# Creates folder [argument 1]/lm - -if [ $# != 1 ]; then - echo "Usage: ../../local/timit_train_lms.sh [data path]" - echo "eg: ../../local/timit_train_lms.sh data/local" - exit 1; -fi - - -dir=$1/lm -trans_file=$1/train_trans.txt -phones_file=$1/phones.txt -lex_file=$1/lexicon.txt - -if [ ! -e $trans_file ]; then - echo "Transcript file $trans_file not found. Did you run local/timit_data_prep.sh" - exit 1; -fi - -mkdir -p $dir -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:`pwd`/../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd ../../../tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - -mkdir -p $dir - -echo "Creating phones file, and monophone lexicon (mapping phones to itself)." 
-cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq | awk '{print tolower($1) ; }' > $phones_file -cat $phones_file | awk '{print toupper($1) " " $1 ; }' > $lex_file -cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon. -cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(cat $lex_file | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts - -# note: we probably won't really make use of as there aren't any OOVs -cat $dir/unigram.counts | awk '{print $2}' | local/get_word_map.pl "" "" > $dir/word_map - -# note: ignore 2nd field of train.txt, it's the utterance-id. -cat $trans_file | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz - -! merge_ngrams &/dev/null && \ - echo merge_ngrams not found in kaldi_lm. You need to have kaldi_lm on your path OR && \ - echo You can do the following: && \ - echo 1. Install the latest version from http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz && \ - echo 2. you delete kaldi_lm, and kaldi_lm.tar.gz in the tools folder. This script will automatically install it. && \ - exit 1; - -echo "Creating biphone model" -local/create_biphone_lm.sh $dir diff --git a/egs/timit/s3/path.sh b/egs/timit/s3/path.sh deleted file mode 100755 index 35e306fa45e..00000000000 --- a/egs/timit/s3/path.sh +++ /dev/null @@ -1,3 +0,0 @@ - -export PATH=$PWD/scripts/:$PWD/../../../src/bin:$PWD/../../../tools/openfst/bin:$PWD/../../../src/fstbin/:$PWD/../../../src/gmmbin/:$PWD/../../../src/featbin/:$PWD/../../../src/lm/:$PWD/../../../src/sgmmbin/:$PWD/../../../src/fgmmbin/:$PWD/../../../src/latbin/:$PWD:$PATH -export LC_ALL=C diff --git a/egs/timit/s3/run.sh b/egs/timit/s3/run.sh deleted file mode 100755 index 97b9973d8c3..00000000000 --- a/egs/timit/s3/run.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Navdeep Jaitly - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be safe we suggest running the recipe line by line. Otherwise -# comment out the following line -exit 1; - -. path.sh -local/timit_data_prep.sh /ais/gobi2/speech/TIMIT || exit 1; -# local/timit_data_prep.sh /export/corpora5/LDC/LDC93S1 || exit 1; -local/timit_train_lms.sh data/local || exit 1 ; -local/timit_format_data.sh || exit 1; - -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. -mfccdir=mfccs - -for test in train test dev ; do - steps/make_mfcc.sh data/$test exp/make_mfcc/$test $mfccdir 4 -done - -# train monophone system. -steps/train_mono.sh data/train data/lang exp/mono || exit 1; - -scripts/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1; -echo "Decoding test datasets." 
-for test in dev test ; do - steps/decode_deltas.sh exp/mono data/$test data/lang exp/mono/decode_$test & -done -wait -scripts/average_wer.sh exp/mono/decode_*/wer > exp/mono/wer || exit 1; - -# Get alignments from monophone system. -echo "Creating training alignments to use to train other systems such as ANN-HMM." -steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali_train || exit 1; -echo "Creating dev alignments to use to train other systems such as ANN-HMM." -steps/align_deltas.sh data/dev data/lang exp/mono exp/mono_ali_dev || exit 1; -echo "Creating test alignments to use to train other systems such as ANN-HMM." -steps/align_deltas.sh data/test data/lang exp/mono exp/mono_ali_test || exit 1; - - diff --git a/egs/timit/s3/scripts/add_disambig.pl b/egs/timit/s3/scripts/add_disambig.pl deleted file mode 100755 index 9036b484e29..00000000000 --- a/egs/timit/s3/scripts/add_disambig.pl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds some specified number of disambig symbols to a symbol table. -# Adds these as #1, #2, etc. -# If the --include-zero option is specified, includes an extra one -# #0. -if(!(@ARGV == 2 || (@ARGV ==3 && $ARGV[0] eq "--include-zero"))) { - die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt "; -} - -if(@ARGV == 3) { - $include_zero = 1; - $ARGV[0] eq "--include-zero" || die "Bad option/first argument $ARGV[0]"; - shift @ARGV; -} else { - $include_zero = 0; -} - -$input = $ARGV[0]; -$nsyms = $ARGV[1]; - -open(F, "<$input") || die "Opening file $input"; - -while() { - @A = split(" ", $_); - @A == 2 || die "Bad line $_"; - $lastsym = $A[1]; - print; -} - -if(!defined($lastsym)){ - die "Empty symbol file?"; -} - -if($include_zero) { - $lastsym++; - print "#0 $lastsym\n"; -} - -for($n = 1; $n <= $nsyms; $n++) { - $y = $n + $lastsym; - print "#$n $y\n"; -} diff --git a/egs/timit/s3/scripts/add_lex_disambig.pl b/egs/timit/s3/scripts/add_lex_disambig.pl deleted file mode 100755 index 86d96848c97..00000000000 --- a/egs/timit/s3/scripts/add_lex_disambig.pl +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. 
-# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. - -if(@ARGV != 2) { - die "Usage: add_lex_disambig.pl [ --sil silphone ] lexicon.txt lexicon_disambig.txt " -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -$max_disambig = 0; -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - $phnseq = join(" ",@A); - if(!defined $issubseq{$phnseq} - && $count{$phnseq}==1) { - ; # Do nothing. - } else { - if($phnseq eq "") { # need disambig symbols for the empty string - # that are not used anywhere else. - $max_disambig++; - $reserved{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $curnumber = $disambig_of{$phnseq}; - if(!defined{$curnumber}) { $curnumber = 0; } - $curnumber++; # now 1 or 2, ... - while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols - if($curnumber > $max_disambig) { - $max_disambig = $curnumber; - } - $disambig_of{$phnseq} = $curnumber; - $phnseq = $phnseq . " #" . $curnumber; - } - } - print O "$word\t$phnseq\n"; -} - -print $max_disambig . "\n"; - diff --git a/egs/timit/s3/scripts/average_wer.sh b/egs/timit/s3/scripts/average_wer.sh deleted file mode 100755 index a2c9c35109d..00000000000 --- a/egs/timit/s3/scripts/average_wer.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from one directory above this script. 
- -grep WER $* | \ - awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", (100.0*n)/d, n, d); }' diff --git a/egs/timit/s3/scripts/collapse_phones.pl b/egs/timit/s3/scripts/collapse_phones.pl deleted file mode 100755 index f2126a48882..00000000000 --- a/egs/timit/s3/scripts/collapse_phones.pl +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env perl -use strict ; - -my $ignore_first_field = 0; -if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } - -my $symtab = shift @ARGV; - -if(!defined $symtab) { - die "Usage: collapse_phones.pl --ignore-first-field symtab [phoneme mapping] > output transcriptions\n"; -} - -my $mapping_str = shift @ARGV; -if(!defined $mapping_str) { - die "Usage: collapse_phones.pl --ignore-first-field symtab [phoneme mapping] > output transcriptions\n"; -} - -my %mapping; -my @parts = split(",", $mapping_str); -for my $part (@parts) { - my ($from, $to) = split(":", $part); - $mapping{uc($from)} = uc($to) ; -} - -my %sym2int ; -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while(<F>) { - my @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -# change the mappings. -my %int2int ; -foreach my $key (keys %sym2int) { - my $value = $sym2int{$key} ; - if (exists($mapping{$key})) { - $int2int{$value} = $sym2int{$mapping{$key}} ; - } else { - $int2int{$value} = $value ; - } -} - -while(<>) { - my @A = split(" ", $_); - if(@A == 0) { - die "Empty line in transcriptions input."; - } - if($ignore_first_field) { - my $key = shift @A; - print $key . " "; - } - foreach $a (@A) { - my $i = $int2int{$a}; - if(!defined ($i)) { - die "collapse_phones.pl: undefined symbol $a\n"; - } - print $i . " "; - } - print "\n"; -} - - diff --git a/egs/timit/s3/scripts/eps2disambig.pl b/egs/timit/s3/scripts/eps2disambig.pl deleted file mode 100755 index 049802b0888..00000000000 --- a/egs/timit/s3/scripts/eps2disambig.pl +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:; - print; -} diff --git a/egs/timit/s3/scripts/filter_scp.pl b/egs/timit/s3/scripts/filter_scp.pl deleted file mode 100755 index c60b9800f84..00000000000 --- a/egs/timit/s3/scripts/filter_scp.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids and filters an scp -# file (or any file whose first field is an utterance id), printing -# out only those lines whose first field is in id_list. - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl id_list [in.scp] > out.scp "; -} - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - if($seen{$A[0]}) { - print $_; - } -} diff --git a/egs/timit/s3/scripts/int2sym.pl b/egs/timit/s3/scripts/int2sym.pl deleted file mode 100755 index ad85ef34993..00000000000 --- a/egs/timit/s3/scripts/int2sym.pl +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_noninteger = 0; -$ignore_first_field = 0; -$field = -1; -for($x = 0; $x < 2; $x++) { - if($ARGV[0] eq "--ignore-noninteger") { $ignore_noninteger = 1; shift @ARGV; } - if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } - if($ARGV[0] eq "--field") { - shift @ARGV; $field = $ARGV[0]+0; shift @ARGV; - if ($field < 1) { die "Bad argument to --field option: $field"; } - } -} - -if ($ignore_first_field && $field > 0) { die "Incompatible options ignore-first-field and field"; } -$zfield = $field-1; # Change to zero-based indexing. - -$symtab = shift @ARGV; -if(!defined $symtab) { - die "Usage: sym2int.pl symtab [input] > output\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $int2sym{$A[1]} = $A[0]; -} - -sub int2sym { - my $a = shift @_; - my $pos = shift @_; - if($a !~ m:^\d+$:) { # not all digits.. - if($ignore_noninteger) { - print $a . " "; - next; - } else { - if($pos == 0) { - die "int2sym.pl: found noninteger token $a (try --ignore-first-field)\n"; - } else { - die "int2sym.pl: found noninteger token $a (try --ignore-noninteger if valid input)\n"; - } - } - } - $s = $int2sym{$a}; - if(!defined ($s)) { - die "int2sym.pl: integer $a not in symbol table $symtab."; - } - return $s; -} - -$error = 0; -while(<>) { - @A = split(" ", $_); - if($ignore_first_field) { - $key = shift @A; - print $key . 
" "; - } - if ($field != -1) { - if ($zfield <= $#A && $zfield >= 0) { - $a = $A[$zfield]; - $A[$zfield] = int2sym($a, $zfield); - } - print join(" ", @A); - } else { - for ($pos = 0; $pos <= $#A; $pos++) { - $a = $A[$pos]; - $s = int2sym($a, $pos); - print $s . " "; - } - } - print "\n"; -} - - - diff --git a/egs/timit/s3/scripts/is_sorted.sh b/egs/timit/s3/scripts/is_sorted.sh deleted file mode 100755 index ac6ae42e74e..00000000000 --- a/egs/timit/s3/scripts/is_sorted.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Usage: is_sorted.sh [script-file] -# This script returns 0 (success) if the script file argument [or standard input] -# is sorted and 1 otherwise. - -export LC_ALL=C - -if [ $# == 0 ]; then - scp=- -fi -if [ $# == 1 ]; then - scp=$1 -fi -if [ $# -gt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then - echo "Usage: is_sorted.sh [script-file]" - exit 1 -fi - -cat $scp > /tmp/tmp1.$$ -sort /tmp/tmp1.$$ > /tmp/tmp2.$$ -cmp /tmp/tmp1.$$ /tmp/tmp2.$$ >/dev/null -ret=$? -rm /tmp/tmp1.$$ /tmp/tmp2.$$ -if [ $ret == 0 ]; then - exit 0; -else - echo "is_sorted.sh: script file $scp is not sorted"; - exit 1; -fi diff --git a/egs/timit/s3/scripts/make_lexicon_fst.pl b/egs/timit/s3/scripts/make_lexicon_fst.pl deleted file mode 100755 index ada17f64e11..00000000000 --- a/egs/timit/s3/scripts/make_lexicon_fst.pl +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST (no pron-probs involved). - -if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] lexiconfst.txt" -} - -$lexfn = shift @ARGV; -if(@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2){ - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - - -if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. 
- while() { - @A = split(" ", $_); - $w = shift @A; - if(@A == 0) { # For empty words ( and ) insert no optional - # silence (not needed as adjacent words supply it).... - # actually we only hit this case for the lexicon without disambig - # symbols but doesn't ever matter as training transcripts don't have or . - print "$loopstate\t$loopstate\t\t$w\n"; - } else { - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if(@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps\n"; - $word_or_eps = ""; - $s = $ns; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while() { - @A = split(" ", $_); - $w = shift @A; - if(@A == 0) { # For empty words ( and ) insert no optional - # silence (not needed as adjacent words supply it).... - # actually we only hit this case for the lexicon without disambig - # symbols but doesn't ever matter as training transcripts don't have or . - print "$loopstate\t$loopstate\t\t$w\n"; - } else { - $is_silence_word = (@A == 1 && $A[0] eq $silphone); # boolean. - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if(@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps\n"; - $word_or_eps = ""; - $s = $ns; - } else { - if(! $is_silence_word) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps\n"; - } - $word_or_eps = ""; - } - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/egs/timit/s3/scripts/make_phones_symtab.pl b/egs/timit/s3/scripts/make_phones_symtab.pl deleted file mode 100755 index 03b8cbe7af3..00000000000 --- a/egs/timit/s3/scripts/make_phones_symtab.pl +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# make_phones_symtab.pl < lexicon.txt > phones.txt - - -while(<>) { - @A = split(" ", $_); - for ($i=1; $i<@A; $i++) { - $P{$A[$i]} = 1; # seen it. 
- } -} - -print "<eps>\t0\n"; -$n = 1; -foreach $p (sort keys %P) { - if($p ne "") { - print "$p\t$n\n"; - $n++; - } -} - -print "sil\t$n\n"; - diff --git a/egs/timit/s3/scripts/make_rm_dict.pl b/egs/timit/s3/scripts/make_rm_dict.pl deleted file mode 100755 index 8aee98e7481..00000000000 --- a/egs/timit/s3/scripts/make_rm_dict.pl +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Yanmin Qian Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This file takes as input the file pcdsril.txt that comes with the RM -# distribution, and creates the dictionary used in RM training. - -# make_rm_dct.pl pcdsril.txt > dct.txt - -if (@ARGV != 1) { - die "usage: make_rm_dct.pl pcdsril.txt > dct.txt\n"; -} -unless (open(IN_FILE, "@ARGV[0]")) { - die ("can't open @ARGV[0]"); -} - -while ($line = <IN_FILE>) -{ - chop($line); - if (($line =~ /^[a-z]/)) - { - $line =~ s/\+1//g; - @LineArray = split(/\s+/,$line); - @LineArray[0] = uc(@LineArray[0]); - - printf "%-16s", @LineArray[0]; - for ($i = 1; $i < @LineArray; $i ++) - { - if (@LineArray[$i] eq 'q') - {} - elsif (@LineArray[$i] eq 'zh') - { - printf "sh "; - } - elsif (@LineArray[$i] eq 'eng') - { - printf "ng "; - } - elsif (@LineArray[$i] eq 'hv') - { - printf "hh "; - } - elsif (@LineArray[$i] eq 'em') - { - printf "m "; - } - elsif (@LineArray[$i] eq 'axr') - { - printf "er "; - } - elsif (@LineArray[$i] eq 'tcl') - { - if (@LineArray[$i+1] ne 't') - { - printf "td "; - } - } - elsif (@LineArray[$i] eq 'dcl') - { - if (@LineArray[$i+1] ne 'd') - { - printf "dd "; - } - } - elsif (@LineArray[$i] eq 'kcl') - { - if (@LineArray[$i+1] ne 'k') - { - printf "kd "; - } - } - elsif (@LineArray[$i] eq 'pcl') - { - if (@LineArray[$i+1] ne 'p') - { - printf "pd "; - } - } - elsif (@LineArray[$i] eq 'bcl') - { - if (@LineArray[$i+1] ne 'b') - { - printf "b "; - } - } - elsif (@LineArray[$i] eq 'gcl') - { - if (@LineArray[$i+1] ne 'g') - { - printf "g "; - } - } - elsif (@LineArray[$i] eq 't') - { - if (@LineArray[$i+1] ne 's') - { - printf "@LineArray[$i] "; - } - else - { - printf "ts "; - $i++; - } - } - else - { - printf "@LineArray[$i] "; - } - } - printf "\n"; - } -} - -printf "!SIL sil\n"; - -close(IN_FILE); - - diff --git a/egs/timit/s3/scripts/make_rm_lm.pl b/egs/timit/s3/scripts/make_rm_lm.pl deleted file mode 100755 index 053fb294329..00000000000 --- a/egs/timit/s3/scripts/make_rm_lm.pl +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2010-2011 Yanmin Qian Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This file takes as input the file wp_gram.txt that comes with the RM -# distribution, and creates the language model as an acceptor in FST form. - -# make_rm_lm.pl wp_gram.txt > G.txt - -if (@ARGV != 1) { - print "usage: make_rm_lm.pl wp_gram.txt > G.txt\n"; - exit(0); -} -unless (open(IN_FILE, "@ARGV[0]")) { - die ("can't open @ARGV[0]"); -} - - -$flag = 0; -$count_wrd = 0; -$cnt_ends = 0; -$init = ""; - -while ($line = ) -{ - chop($line); - - $line =~ s/ //g; - - if(($line =~ /^>/)) - { - if($flag == 0) - { - $flag = 1; - } - $line =~ s/>//g; - $hashcnt{$init} = $i; - $init = $line; - $i = 0; - $count_wrd++; - @LineArray[$count_wrd - 1] = $init; - $hashwrd{$init} = 0; - } - elsif($flag != 0) - { - - $hash{$init}[$i] = $line; - $i++; - if($line =~ /SENTENCE-END/) - { - $cnt_ends++; - } - } - else - {} -} - -$hashcnt{$init} = $i; - -$num = 0; -$weight = 0; -$init_wrd = "SENTENCE-END"; -$hashwrd{$init_wrd} = @LineArray; -for($i = 0; $i < $hashcnt{$init_wrd}; $i++) -{ - $weight = -log(1/$hashcnt{$init_wrd}); - $hashwrd{$hash{$init_wrd}[$i]} = $i + 1; - print "0 $hashwrd{$hash{$init_wrd}[$i]} $hash{$init_wrd}[$i] $hash{$init_wrd}[$i] $weight\n"; -} -$num = $i; - -for($i = 0; $i < @LineArray; $i++) -{ - if(@LineArray[$i] eq 'SENTENCE-END') - {} - else - { - if($hashwrd{@LineArray[$i]} == 0) - { - $num++; - $hashwrd{@LineArray[$i]} = $num; - } - for($j = 0; $j < $hashcnt{@LineArray[$i]}; $j++) - { - $weight = -log(1/$hashcnt{@LineArray[$i]}); - if($hashwrd{$hash{@LineArray[$i]}[$j]} == 0) - { - $num++; - $hashwrd{$hash{@LineArray[$i]}[$j]} = $num; - } - if($hash{@LineArray[$i]}[$j] eq 'SENTENCE-END') - { - print "$hashwrd{@LineArray[$i]} $hashwrd{$hash{@LineArray[$i]}[$j]} $weight\n" - } - else - { - print "$hashwrd{@LineArray[$i]} $hashwrd{$hash{@LineArray[$i]}[$j]} $hash{@LineArray[$i]}[$j] $hash{@LineArray[$i]}[$j] $weight\n"; - } - } - } -} - -print "$hashwrd{$init_wrd} 0\n"; -close(IN_FILE); - - diff --git a/egs/timit/s3/scripts/make_roots.pl b/egs/timit/s3/scripts/make_roots.pl deleted file mode 100755 index 07c224379b6..00000000000 --- a/egs/timit/s3/scripts/make_roots.pl +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Written by Dan Povey 9/21/2010. Apache 2.0 License. - -# This version of make_roots.pl is specialized for RM. - -# This script creates the file roots.txt which is an input to train-tree.cc. It -# specifies how the trees are built. 
The input file phone-sets.txt is a partial -# version of roots.txt in which phones are represented by their spelled form, not -# their symbol id's. E.g. at input, phone-sets.txt might contain; -# shared not-split sil -# Any phones not specified in phone-sets.txt but present in phones.txt will -# be given a default treatment. If the --separate option is given, we create -# a separate tree root for each of them, otherwise they are all lumped in one set. -# The arguments shared|not-shared and split|not-split are needed if any -# phones are not specified in phone-sets.txt. What they mean is as follows: -# if shared=="shared" then we share the tree-root between different HMM-positions -# (0,1,2). If split=="split" then we actually do decision tree splitting on -# that root, otherwise we forbid decision-tree splitting. (The main reason we might -# set this to false is for silence when -# we want to ensure that the HMM-positions will remain with a single PDF id. - - -$separate = 0; -if($ARGV[0] eq "--separate") { - $separate = 1; - shift @ARGV; -} - -if(@ARGV != 4) { - die "Usage: make_roots.pl [--separate] phones.txt silence-phone-list[integer,colon-separated] shared|not-shared split|not-split > roots.txt\n"; -} - - -($phonesfile, $silphones, $shared, $split) = @ARGV; -if($shared ne "shared" && $shared ne "not-shared") { - die "Third argument must be \"shared\" or \"not-shared\"\n"; -} -if($split ne "split" && $split ne "not-split") { - die "Third argument must be \"split\" or \"not-split\"\n"; -} - - - -open(F, "<$phonesfile") || die "Opening file $phonesfile"; - -while() { - @A = split(" ", $_); - if(@A != 2) { - die "Bad line in phones symbol file: ".$_; - } - if($A[1] != 0) { - $symbol2id{$A[0]} = $A[1]; - $id2symbol{$A[1]} = $A[0]; - } -} - -if($silphones == ""){ - die "Empty silence phone list in make_roots.pl"; -} -foreach $silphoneid (split(":", $silphones)) { - defined $id2symbol{$silphoneid} || die "No such silence phone id $silphoneid"; - # Give each silence phone its own separate pdfs in each state, but - # no sharing (in this recipe; WSJ is different.. in this recipe there - #is only one silence phone anyway.) - $issil{$silphoneid} = 1; - print "not-shared not-split $silphoneid\n"; -} - -$idlist = ""; -$remaining_phones = ""; - -if($separate){ - foreach $a (keys %id2symbol) { - if(!defined $issil{$a}) { - print "$shared $split $a\n"; - } - } -} else { - print "$shared $split "; - foreach $a (keys %id2symbol) { - if(!defined $issil{$a}) { - print "$a "; - } - } - print "\n"; -} diff --git a/egs/timit/s3/scripts/make_words_symtab.pl b/egs/timit/s3/scripts/make_words_symtab.pl deleted file mode 100755 index 509078898fc..00000000000 --- a/egs/timit/s3/scripts/make_words_symtab.pl +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -# make_words_symtab.pl < lexicon.txt > words.txt - - -while(<>) { - @A = split(" ", $_); - $W{$A[0]} = 1; -} - -print "\t0\n"; -$n = 1; -foreach $w (sort keys %W) { - if($w ne "") { - print "$w\t$n\n"; - $n++; - } -} - -print "!SIL\t$n\n"; - diff --git a/egs/timit/s3/scripts/mkgraph.sh b/egs/timit/s3/scripts/mkgraph.sh deleted file mode 100755 index e7d3fbe6b19..00000000000 --- a/egs/timit/s3/scripts/mkgraph.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -N=3 -P=1 -clean=false - -for x in 1 2 3; do - if [ $1 == "--mono" ]; then - N=1; - P=0; - shift; - fi - if [ $1 == "--clean" ]; then - clean=true - shift; - fi - -done - -if [ $# != 3 ]; then - echo "Usage: scripts/mkgraph.sh " - echo "e.g.: scripts/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph" - exit 1; -fi - -if [ -f path.sh ]; then . path.sh; fi - -lang=$1 -tree=$2/tree -model=$2/final.mdl -dir=$3 - -if $clean; then rm -r $lang/tmp; fi - -mkdir -p $dir - -tscale=1.0 -loopscale=0.1 - -# If $lang/tmp/LG.fst does not exist or is older than its sources, make it... -# (note: the [[ ]] brackets make the || type operators work (inside [ ], we -# would have to use -o instead), -f means file exists, and -ot means older than). - -mkdir -p $lang/tmp -if [[ ! -f $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ - $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then - fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded > $lang/tmp/LG.fst || exit 1; - fstisstochastic $lang/tmp/LG.fst || echo "warning: LG not stochastic." -fi - -if [ ! -f $lang/phones_disambig.txt ]; then - echo "No such file $lang/phones_disambig.txt (supplied a training lang/ directory?)" - exit 1; -fi - -grep '#' $lang/phones_disambig.txt | awk '{print $2}' > $lang/tmp/disambig_phones.list - - -clg=$lang/tmp/CLG_${N}_${P}.fst - -if [[ ! -f $clg || $clg -ot $lang/tmp/LG.fst ]]; then - fstcomposecontext --context-size=$N --central-position=$P \ - --read-disambig-syms=$lang/tmp/disambig_phones.list \ - --write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.list \ - $lang/tmp/ilabels_${N}_${P} < $lang/tmp/LG.fst >$clg - fstisstochastic $clg || echo "warning: CLG not stochastic." -fi - -if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model ]]; then - make-h-transducer --disambig-syms-out=$dir/disambig_tid.list \ - --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ - > $dir/Ha.fst || exit 1; -fi - -if [[ ! -f $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ - $dir/HCLGa.fst -ot $clg ]]; then - fsttablecompose $dir/Ha.fst $clg | fstdeterminizestar --use-log=true \ - | fstrmsymbols $dir/disambig_tid.list | fstrmepslocal | \ - fstminimizeencoded > $dir/HCLGa.fst || exit 1; - fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic" -fi - -if [[ ! 
-f $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then - add-self-loops --self-loop-scale=$loopscale --reorder=true \ - $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1; - - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. - fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic." - fi -fi diff --git a/egs/timit/s3/scripts/remove_oovs.pl b/egs/timit/s3/scripts/remove_oovs.pl deleted file mode 100755 index 532d7f295ea..00000000000 --- a/egs/timit/s3/scripts/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(<S>){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/egs/timit/s3/scripts/s2eps.pl b/egs/timit/s3/scripts/s2eps.pl deleted file mode 100755 index ffeeb8eb6af..00000000000 --- a/egs/timit/s3/scripts/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces <s> and </s> with <eps> (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; } - if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; } - } - print join("\t", @A) .
"\n"; -} diff --git a/egs/timit/s3/scripts/silphones.pl b/egs/timit/s3/scripts/silphones.pl deleted file mode 100755 index 3ff85dfe3bb..00000000000 --- a/egs/timit/s3/scripts/silphones.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# creates integer lists of silence and non-silence phones in files, -# e.g. silphones.csl="1:2:3 \n" -# and nonsilphones.csl="4:5:6:7:...:24\n"; - -if(@ARGV != 4) { - die "Usage: silphones.pl phones.txt \"sil1 sil2 sil3\" silphones.csl nonsilphones.csl"; -} - -($symtab, $sillist, $silphones, $nonsilphones) = @ARGV; -open(S,"<$symtab") || die "Opening symbol table $symtab"; - - -foreach $s (split(" ", $sillist)) { - $issil{$s} = 1; -} - -@sil = (); -@nonsil = (); -while(){ - @A = split(" ", $_); - @A == 2 || die "Bad line $_ in phone-symbol-table file $symtab"; - ($sym, $int) = @A; - if($int != 0) { - if($issil{$sym}) { push @sil, $int; $seensil{$sym}=1; } - else { push @nonsil, $int; } - } -} - -foreach $k(keys %issil) { - if(!$seensil{$k}) { die "No such silence phone $k"; } -} -open(F, ">$silphones") || die "opening silphones file $silphones"; -open(G, ">$nonsilphones") || die "opening nonsilphones file $nonsilphones"; -print F join(":", @sil) . "\n"; -print G join(":", @nonsil) . "\n"; -close(F); -close(G); -if(@sil == 0) { print STDERR "Warning: silphones.pl no silence phones.\n" } -if(@nonsil == 0) { print STDERR "Warning: silphones.pl no non-silence phones.\n" } - diff --git a/egs/timit/s3/scripts/spk2utt_to_utt2spk.pl b/egs/timit/s3/scripts/spk2utt_to_utt2spk.pl deleted file mode 100755 index 23992f25dea..00000000000 --- a/egs/timit/s3/scripts/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/egs/timit/s3/scripts/split_scp.pl b/egs/timit/s3/scripts/split_scp.pl deleted file mode 100755 index 9ffb29b76f2..00000000000 --- a/egs/timit/s3/scripts/split_scp.pl +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# This program splits up any kind of .scp or archive-type file. -# If there is no utt2spk option it will work on any text file and -# will split it up with an approximately equal number of lines in -# each but. -# With the --utt2spk option it will work on anything that has the -# utterance-id as the first entry on each line; the utt2spk file is -# of the form "utterance speaker" (on each line). -# It splits it into equal size chunks as far as it can. If you use -# the utt2spk option it will make sure these chunks coincide with -# speaker boundaries. In this case, if there are more chunks -# than speakers (and in some other circumstances), some of the -# resulting chunks will be empty and it -# will print a warning. -# You will normally call this like: -# split_scp.pl scp scp.1 scp.2 scp.3 ... -# or -# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ... -# Note that you can use this script to split the utt2spk file itself, -# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ... - -if(@ARGV < 2 ) { - die "Usage: split_scp.pl [--utt2spk=] in.scp out1.scp out2.scp ... "; -} - -if($ARGV[0] =~ m:^-:) { - # Everything inside this block - # corresponds to what we do when the --utt2spk option is used. - $opt = shift @ARGV; - @A = split("=", $opt); - if(@A != 2 || $A[0] ne "--utt2spk") { - die "split_scp.pl: invalid option $ARGV[0]"; - } - $utt2spk_file = $A[1]; - open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file"; - while() { - @A = split; - @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file"; - ($u,$s) = @A; - $utt2spk{$u} = $s; - } - $inscp = shift @ARGV; - open(I, "<$inscp") || die "Opening input scp file $inscp"; - @spkrs = (); - while() { - @A = split; - if(@A == 0) { die "Empty or space-only line in scp file $inscp"; } - $u = $A[0]; - $s = $utt2spk{$u}; - if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } - if(!defined $spk_count{$s}) { - push @spkrs, $s; - $spk_count{$s} = 0; - $spk_data{$s} = ""; - } - $spk_count{$s}++; - $spk_data{$s} = $spk_data{$s} . $_; - } - # Now split as equally as possible .. - # First allocate spks to files by given approximately - # equal #spks. - $numspks = @spkrs; # number of speakers. - $numscps = @ARGV; # number of output files. - $spksperscp = int( ($numspks+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up. 
- for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scparray[$scpidx] = []; # [] is array reference. - for($n = $spksperscp * $scpidx; - $n < $numspks && $n < $spksperscp*($scpidx+1); - $n++) { - $spk = $spkrs[$n]; - push @{$scparray[$scpidx]}, $spk; - $scpcount[$scpidx] += $spk_count{$spk}; - } - } - # Now will try to reassign beginning + ending speakers - # to different scp's and see if it gets more balanced. - # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2. - # We can show that if considering changing just 2 scp's, we minimize - # this by minimizing the squared difference in sizes. This is - # equivalent to minimizing the absolute difference in sizes. This - # shows this method is bound to converge. - - $changed = 1; - while($changed) { - $changed = 0; - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - # First try to reassign ending spk of this scp. - if($scpidx < $numscps-1) { - $sz = @{$scparray[$scpidx]}; - if($sz > 0) { - $spk = $scparray[$scpidx]->[$sz-1]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx]; - $nutt2 = $scpcount[$scpidx+1]; - if( abs( ($nutt2+$count) - ($nutt1-$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx+1] += $count; - $scpcount[$scpidx] -= $count; - pop @{$scparray[$scpidx]}; - unshift @{$scparray[$scpidx+1]}, $spk; - $changed = 1; - } - } - } - if($scpidx > 0 && @{$scparray[$scpidx]} > 0) { - $spk = $scparray[$scpidx]->[0]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx-1]; - $nutt2 = $scpcount[$scpidx]; - if( abs( ($nutt2-$count) - ($nutt1+$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx-1] += $count; - $scpcount[$scpidx] -= $count; - shift @{$scparray[$scpidx]}; - push @{$scparray[$scpidx-1]}, $spk; - $changed = 1; - } - } - } - } - # Now print out the files... - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scpfn = $ARGV[$scpidx]; - open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing."; - $count = 0; - if(@{$scparray[$scpidx]} == 0) { - print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)"; - } - foreach $spk ( @{$scparray[$scpidx]} ) { - print F $spk_data{$spk}; - $count += $spk_count{$spk}; - } - if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } - close(F); - } -} else { - # This block is the "normal" case where there is no --utt2spk - # option and we just break into equal size chunks. - - $inscp = shift @ARGV; - open(I, "<$inscp") || die "Opening input scp file $inscp"; - - $numscps = @ARGV; # size of array. - @F = (); - while() { - push @F, $_; - } - $numlines = @F; - if($numlines == 0) { - print STDERR "split_scp.pl: warning: empty input scp file $inscp"; - } - $linesperscp = int( ($numlines+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up. -# [just doing int() rounds down]. 
- for($scpidx = 0; $scpidx < @ARGV; $scpidx++) { - $scpfile = $ARGV[$scpidx]; - open(O, ">$scpfile") || die "Opening output scp file $scpfile"; - for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) { - print O $F[$n]; - } - close(O) || die "Closing scp file $scpfile"; - } -} diff --git a/egs/timit/s3/scripts/sym2int.pl b/egs/timit/s3/scripts/sym2int.pl deleted file mode 100755 index ee22d3f13bd..00000000000 --- a/egs/timit/s3/scripts/sym2int.pl +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; -$ignore_first_field = 0; -for($x = 0; $x < 2; $x++) { - if($ARGV[0] eq "--ignore-oov") { $ignore_oov = 1; shift @ARGV; } - if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } -} - -$symtab = shift @ARGV; -if(!defined $symtab) { - die "Usage: sym2int.pl symtab [input transcriptions] > output transcriptions\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -while(<>) { - @A = split(" ", $_); - if(@A == 0) { - die "Empty line in transcriptions input."; - } - if($ignore_first_field) { - $key = shift @A; - print $key . " "; - } - foreach $a (@A) { - $i = $sym2int{$a}; - if(!defined ($i)) { - if($ignore_oov) { - print $a . " " ; - } else { - die "sym2int.pl: undefined symbol $a\n"; - } - } - print $i . " "; - } - print "\n"; -} - - diff --git a/egs/timit/s3/scripts/utt2spk_to_spk2utt.pl b/egs/timit/s3/scripts/utt2spk_to_spk2utt.pl deleted file mode 100755 index f5e61459bc9..00000000000 --- a/egs/timit/s3/scripts/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - $uttlist{$s} = $uttlist{$s} . "$u "; -} -foreach $s (@spklist) { - $l = $uttlist{$s}; - $l =~ s: $::; # remove trailing space. 
- print "$s $l\n"; -} diff --git a/egs/timit/s3/steps/align_deltas.sh b/egs/timit/s3/steps/align_deltas.sh deleted file mode 100755 index fd24edb789a..00000000000 --- a/egs/timit/s3/steps/align_deltas.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. - -# This script does training-data alignment given a model built using -# CMN + delta + delta-delta features. Its output, all in its own -# experimental directory, is cmvn.ark, ali, tree, and final.mdl -# (the last two are just copied from the source directory). - -# Option to use precompiled graphs from last phase, if these -# are available (i.e. if they were built with the same data). - -graphs= -if [ "$1" == --graphs ]; then - shift; - graphs=$1 - shift -fi - - -if [ $# != 4 ]; then - echo "Usage: steps/align_deltas.sh " - echo " e.g.: steps/align_deltas.sh data/train data/lang exp/tri1 exp/tri1_ali" - exit 1; -fi - -if [ -f path.sh ]; then . path.sh; fi - -data=$1 -lang=$2 -srcdir=$3 -dir=$4 - - -model=$srcdir/final.mdl - - -mkdir -p $dir -cp $model $dir/final.mdl || exit 1; # Create copy of that model... -cp $srcdir/tree $dir/tree || exit 1; # and the tree... - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" - - - -echo "Computing cepstral mean and variance statistics" -compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp \ - ark:$dir/cmvn.ark 2>$dir/cmvn.log || exit 1; - -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# Align all training data using the supplied model. - -echo "Aligning all training data" -if [ -z "$graphs" ]; then # --graphs option not supplied [-z means empty string] - # compute integer form of transcripts. - scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text > $dir/train.tra \ - || exit 1; - gmm-align $scale_opts --beam=8 --retry-beam=40 $srcdir/tree $model $lang/L.fst \ - "$feats" ark:$dir/train.tra ark:$dir/ali 2> $dir/align.log || exit 1; - rm $dir/train.tra -else - gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $model \ - "$graphs" "$feats" ark:$dir/ali 2> $dir/align.log || exit 1; -fi - -echo "Done." diff --git a/egs/timit/s3/steps/decode_deltas.sh b/egs/timit/s3/steps/decode_deltas.sh deleted file mode 100755 index 9f886a79e2c..00000000000 --- a/egs/timit/s3/steps/decode_deltas.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash - -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# Decoding script that works with a GMM model and delta-delta plus -# cepstral mean subtraction features. Used, for example, to decode -# mono/ and tri1/ - -if [ $# != 4 ]; then - echo "Usage: steps/decode_deltas.sh " - echo " e.g.: steps/decode_deltas.sh exp/mono data/test_feb89 data/test_lang exp/mono/decode_feb89" - exit 1; -fi - -srcdir=$1 -data=$2 -lang=$3 -dir=$4 -graphdir=$srcdir/graph - -mkdir -p $dir - -if [ -f path.sh ]; then . path.sh; fi - -if [ ! -f $srcdir/final.mdl ]; then - echo No model file $srcdir/final.mdl - exit 1; -fi - -if [[ ! -f $graphdir/HCLG.fst || $graphdir/HCLG.fst -ot $srcdir/final.mdl ]]; then - echo "Graph $graphdir/HCLG.fst does not exist or is too old." - exit 1; -fi - -# We only do one decoding pass, so there is no point caching the -# CMVN stats-- we make them part of a pipe. -feats="ark:compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:- scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# For Resource Management, we use beam of 30 and acwt of 1/7. -# More normal, LVCSR setups would have a beam of 13 and acwt of 1/15 or so. -# If you decode with a beam of 20 on an LVCSR setup it will be very slow. - -gmm-decode-faster --beam=30.0 --acoustic-scale=0.1429 --word-symbol-table=$lang/words.txt \ - $srcdir/final.mdl $graphdir/HCLG.fst "$feats" ark,t:$dir/test.tra ark,t:$dir/test.ali \ - 2> $dir/decode.log || exit 1; - -# In this setup there are no non-scored words, so -# scoring is simple. - -# the ,p option lets it score partial output without dying.. - -#scripts/sym2int.pl --ignore-first-field $lang/words.txt $data/text | \ -# compute-wer --mode=present ark:- ark,p:$dir/test.tra >& $dir/wer - -mapping="en:n,ao:aa,ax-h:ah,ax:ah,ix:ih,el:l,zh:sh,ux:uw,axr:er,em:m,nx:n,eng:ng,hv:hh,pcl:pau,tcl:pau,kcl:pau,q:pau,bcl:pau,dcl:pau,gcl:pau,epi:pau" -scripts/collapse_phones.pl --ignore-first-field $lang/words.txt "$mapping" < $dir/test.tra > tmp -scripts/sym2int.pl --ignore-first-field $lang/words.txt $data/text | \ - scripts/collapse_phones.pl --ignore-first-field $lang/words.txt "$mapping" |\ - compute-wer --mode=present ark:- ark,p:tmp >& $dir/wer - -rm tmp - - - diff --git a/egs/timit/s3/steps/make_mfcc.sh b/egs/timit/s3/steps/make_mfcc.sh deleted file mode 100755 index dc5b01c5f59..00000000000 --- a/egs/timit/s3/steps/make_mfcc.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. 
-# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. (one directory up from here) - -if [ $# != 4 ]; then - echo "usage: make_mfcc.sh "; - exit 1; -fi - -if [ -f path.sh ]; then . path.sh; fi - -data=$1 -logdir=$2 -mfccdir=$3 -ncpus=$4 - -# use "name" as part of name of the archive. -name=`basename $data` - -mkdir -p $mfccdir || exit 1; -mkdir -p $logdir || exit 1; - -scp=$data/wav.scp -config=conf/mfcc.conf -required="$scp $config" - -for f in $required; do - if [ ! -f $f ]; then - echo "make_mfcc.sh: no such file $f" - exit 1; - fi -done - -# note: in general, the double-parenthesis construct in bash "((" is "C-style -# syntax" where we can get rid of the $ for variable names, and omit spaces. -# The "for" loop in this style is a special construct. - -split_scps="" -for ((n=1; n<=ncpus; n++)); do - split_scps="$split_scps $logdir/wav$n.scp" -done - -scripts/split_scp.pl $scp $split_scps || exit 1; - -rm $logdir/.error 2>/dev/null -for ((n=1; n<=ncpus; n++)); do - log=$logdir/make_mfcc.$n.log - compute-mfcc-feats --verbose=2 --config=$config scp:$logdir/wav${n}.scp \ - ark,scp:$mfccdir/raw_mfcc_$name.$n.ark,$mfccdir/raw_mfcc_$name.$n.scp \ - 2> $log || touch $logdir/.error & -done -wait; - -if [ -f $logdir/.error.$name ]; then - echo "Error producing mfcc features for $name:" - tail $logdir/make_mfcc.*.log - exit 1; -fi - -# concatenate the .scp files together. -rm $data/feats.scp 2>/dev/null -for ((n=1; n<=ncpus; n++)); do - cat $mfccdir/raw_mfcc_$name.$n.scp >> $data/feats.scp -done - -rm $logdir/wav*.scp - -echo "Succeeded creating MFCC features for $name" - diff --git a/egs/timit/s3/steps/train_deltas.sh b/egs/timit/s3/steps/train_deltas.sh deleted file mode 100755 index 4a80f74a939..00000000000 --- a/egs/timit/s3/steps/train_deltas.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. -# Triphone model training, using delta-delta features and cepstral -# mean normalization. It starts from an existing directory (e.g. -# exp/mono), supplied as an argument, which is assumed to be built using -# the same type of features. - -if [ $# != 4 ]; then - echo "Usage: steps/train_deltas.sh " - echo " e.g.: steps/train_deltas.sh data/train data/lang exp/mono_ali exp/tri1" - exit 1; -fi - -if [ -f path.sh ]; then . path.sh; fi - -data=$1 -lang=$2 -alidir=$3 -dir=$4 - -if [ ! -f $alidir/final.mdl -o ! -f $alidir/ali ]; then - echo "Error: alignment dir $alidir does not contain final.mdl and ali" - exit 1; -fi - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -realign_iters="5 10 15 20"; -silphonelist=`cat $lang/silphones.csl` -numiters=25 # Number of iterations of training -maxiterinc=15 # Last iter to increase #Gauss on. -numleaves=1800 # target num-leaves in tree building. 
-numgauss=$[$numleaves + $numleaves/2]; # starting num-Gauss. - # Initially mix up to avg. 1.5 Gauss/state ( a bit more - # than this, due to state clustering... then slowly mix - # up to final amount. -totgauss=9000 # Target #Gaussians -incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss - - -mkdir -p $dir - - -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$alidir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# compute integer form of transcripts. -scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text > $dir/train.tra \ - || exit 1; - - -echo "Accumulating tree stats" -acc-tree-stats --ci-phones=$silphonelist $alidir/final.mdl "$feats" \ - ark:$alidir/ali $dir/treeacc 2> $dir/acc.tree.log || exit 1; - - -echo "Computing questions for tree clustering" - -cat $lang/phones.txt | awk '{print $NF}' | grep -v -w 0 > $dir/phones.list -cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/questions.log || exit 1; -scripts/int2sym.pl $lang/phones.txt < $dir/questions.txt > $dir/questions_syms.txt -compile-questions $lang/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1; - -# Have to make silence root not-shared because we will not split it. -scripts/make_roots.pl --separate $lang/phones.txt $silphonelist shared split \ - > $dir/roots.txt 2>$dir/roots.log || exit 1; - - -echo "Building tree" -build-tree --verbose=1 --max-leaves=$numleaves \ - $dir/treeacc $dir/roots.txt \ - $dir/questions.qst $lang/topo $dir/tree 2> $dir/train_tree.log || exit 1; - -gmm-init-model --write-occs=$dir/1.occs \ - $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/init_model.log || exit 1; - -gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \ - 2>$dir/mixup.log || exit 1; - -rm $dir/treeacc - -# Convert alignments generated from monophone model, to use as initial alignments. - -convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree ark:$alidir/ali ark:$dir/cur.ali 2>$dir/convert.log - # Debug step only: convert back and check they're the same. 
- convert-ali $dir/1.mdl $alidir/final.mdl $alidir/tree ark:$dir/cur.ali ark:- \ - 2>/dev/null | cmp - $alidir/ali || exit 1; - -# Make training graphs -echo "Compiling training graphs" -compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst ark:$dir/train.tra \ - "ark:|gzip -c >$dir/graphs.fsts.gz" 2>$dir/compile_graphs.log || exit 1; - -x=1 -while [ $x -lt $numiters ]; do - echo Pass $x - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \ - "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" \ - ark:$dir/cur.ali 2> $dir/align.$x.log || exit 1; - fi - gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" ark:$dir/cur.ali $dir/$x.acc 2> $dir/acc.$x.log || exit 1; - gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1; - rm $dir/$x.mdl $dir/$x.acc - rm $dir/$x.occs - if [[ $x -le $maxiterinc ]]; then - numgauss=$[$numgauss+$incgauss]; - fi - x=$[$x+1]; -done - -( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs ) - -echo Done diff --git a/egs/timit/s3/steps/train_mono.sh b/egs/timit/s3/steps/train_mono.sh deleted file mode 100755 index 3028ba3c339..00000000000 --- a/egs/timit/s3/steps/train_mono.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. -# Flat start and monophone training, with delta-delta features. -# This script applies cepstral mean normalization (per speaker), -# unlike the corresponding script in s1/ - -if [ $# != 3 ]; then - echo "Usage: steps/train_mono.sh " - echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono" - exit 1; -fi - - -data=$1 -lang=$2 -dir=$3 - -if [ -f path.sh ]; then . path.sh; fi - -# Configuration: -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -numiters=30 # Number of iterations of training -maxiterinc=20 # Last iter to increase #Gauss on. -numgauss=250 # Initial num-Gauss (must be more than #states=3*phones). -totgauss=1000 # Target #Gaussians. -incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss -realign_iters="1 2 3 4 5 6 7 8 9 10 12 15 20 25"; - -mkdir -p $dir -echo "Computing cepstral mean and variance statistics" - -compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp \ - ark:$dir/cmvn.ark 2>$dir/cmvn.log || exit 1; - -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# compute integer form of transcripts. -scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text > $dir/train.tra \ - || exit 1; - -echo "Initializing monophone system." 
- -gmm-init-mono "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo 39 \ - $dir/0.mdl $dir/tree 2> $dir/init.log || exit 1; - - -echo "Compiling training graphs" -compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \ - ark:$dir/train.tra "ark:|gzip -c >$dir/graphs.fsts.gz" \ - 2>$dir/compile_graphs.log || exit 1 - -echo Pass 0 - -align-equal-compiled "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" \ - ark,t,f:- 2>$dir/align.0.log | \ - gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \ - $dir/0.acc 2> $dir/acc.0.log || exit 1; - -# In the following steps, the --min-gaussian-occupancy=3 option is important, otherwise -# we fail to est "rare" phones and later on, they never align properly. - -gmm-est --min-gaussian-occupancy=3 --mix-up=$numgauss \ - $dir/0.mdl $dir/0.acc $dir/1.mdl 2> $dir/update.0.log || exit 1; - -rm $dir/0.acc - -beam=4 # will change to 8 below after 1st pass -x=1 -while [ $x -lt $numiters ]; do - echo "Pass $x" - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] $dir/$x.mdl \ - "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" t,ark:$dir/cur.ali \ - 2> $dir/align.$x.log || exit 1; - fi - gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" ark:$dir/cur.ali $dir/$x.acc 2> $dir/acc.$x.log || exit 1; - gmm-est --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1; - rm $dir/$x.mdl $dir/$x.acc - if [ $x -le $maxiterinc ]; then - numgauss=$[$numgauss+$incgauss]; - fi - beam=8 - x=$[$x+1] -done - -( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl ) - -# example of showing the alignments: -# show-alignments data/lang/phones.txt $dir/30.mdl ark:$dir/cur.ali | head -4 - diff --git a/egs/timit/s4/RESULTS b/egs/timit/s4/RESULTS deleted file mode 100644 index f11d53b6fdd..00000000000 --- a/egs/timit/s4/RESULTS +++ /dev/null @@ -1,24 +0,0 @@ -exp/mono/decode_dev_bg/wer_3 -compute-wer --text --mode=present ark:exp/mono/decode_dev_bg/test_trans.filt ark,p:- -%WER 33.73 [ 5079 / 15057, 392 ins, 1716 del, 2971 sub ] -%SER 100.00 [ 400 / 400 ] -Scored 400 sentences, 0 not present in hyp. - -exp/mono/decode_test_bg/wer -compute-wer --text --mode=present ark:exp/mono/decode_test_bg/test.trans ark,p:exp/mono/decode_test_bg/text -%WER 35.68 [ 2574 / 7215, 204 ins, 848 del, 1522 sub ] -%SER 100.00 [ 192 / 192 ] -Scored 192 sentences, 0 not present in hyp. - -exp/tri1/decode_dev_bg/wer_6 -compute-wer --text --mode=present ark:exp/tri1/decode_dev_bg/test.trans ark,p:- -%WER 28.68 [ 4319 / 15057, 474 ins, 1333 del, 2512 sub ] -%SER 100.00 [ 400 / 400 ] -Scored 400 sentences, 0 not present in hyp. - -exp/tri1/decode_test_bg/wer -compute-wer --text --mode=present ark:exp/tri1/decode_test_bg/test.trans ark,p:exp/tri1/decode_test_bg/text -%WER 31.02 [ 2238 / 7215, 226 ins, 704 del, 1308 sub ] -%SER 100.00 [ 192 / 192 ] -Scored 192 sentences, 0 not present in hyp. 
- diff --git a/egs/timit/s4/conf/dev_spk.list b/egs/timit/s4/conf/dev_spk.list deleted file mode 100644 index 564da1f1ec6..00000000000 --- a/egs/timit/s4/conf/dev_spk.list +++ /dev/null @@ -1,50 +0,0 @@ -faks0 -fdac1 -fjem0 -mgwt0 -mjar0 -mmdb1 -mmdm2 -mpdf0 -fcmh0 -fkms0 -mbdg0 -mbwm0 -mcsh0 -fadg0 -fdms0 -fedw0 -mgjf0 -mglb0 -mrtk0 -mtaa0 -mtdt0 -mthc0 -mwjg0 -fnmr0 -frew0 -fsem0 -mbns0 -mmjr0 -mdls0 -mdlf0 -mdvc0 -mers0 -fmah0 -fdrw0 -mrcs0 -mrjm4 -fcal1 -mmwh0 -fjsj0 -majc0 -mjsw0 -mreb0 -fgjd0 -fjmg0 -mroa0 -mteb0 -mjfc0 -mrjr0 -fmml0 -mrws1 diff --git a/egs/timit/s4/conf/phones.60-48-39.map b/egs/timit/s4/conf/phones.60-48-39.map deleted file mode 100644 index 4ebcc140fe7..00000000000 --- a/egs/timit/s4/conf/phones.60-48-39.map +++ /dev/null @@ -1,61 +0,0 @@ -aa aa aa -ae ae ae -ah ah ah -ao ao aa -aw aw aw -ax ax ah -ax-h ax ah -axr er er -ay ay ay -b b b -bcl vcl sil -ch ch ch -d d d -dcl vcl sil -dh dh dh -dx dx dx -eh eh eh -el el l -em m m -en en n -eng ng ng -epi epi sil -er er er -ey ey ey -f f f -g g g -gcl vcl sil -h# sil sil -hh hh hh -hv hh hh -ih ih ih -ix ix ih -iy iy iy -jh jh jh -k k k -kcl cl sil -l l l -m m m -n n n -ng ng ng -nx n n -ow ow ow -oy oy oy -p p p -pau sil sil -pcl cl sil -q -r r r -s s s -sh sh sh -t t t -tcl cl sil -th th th -uh uh uh -uw uw uw -ux uw uw -v v v -w w w -y y y -z z z -zh zh sh diff --git a/egs/timit/s4/conf/test_spk.list b/egs/timit/s4/conf/test_spk.list deleted file mode 100644 index 47f6653d64d..00000000000 --- a/egs/timit/s4/conf/test_spk.list +++ /dev/null @@ -1,24 +0,0 @@ -mdab0 -mwbt0 -felc0 -mtas1 -mwew0 -fpas0 -mjmp0 -mlnt0 -fpkt0 -mlll0 -mtls0 -fjlm0 -mbpm0 -mklt0 -fnlp0 -mcmj0 -mjdh0 -fmgd0 -mgrt0 -mnjm0 -fdhc0 -mjln0 -mpam0 -fmld0 diff --git a/egs/timit/s4/conf/topo.proto b/egs/timit/s4/conf/topo.proto deleted file mode 100644 index 72778cb66ba..00000000000 --- a/egs/timit/s4/conf/topo.proto +++ /dev/null @@ -1,20 +0,0 @@ - - - -NONSILENCEPHONES - - 0 0 0 0.75 1 0.25 - 1 1 1 0.75 2 0.25 - 2 2 2 0.75 3 0.25 - 3 - - - -SILENCEPHONES - - 0 0 0 0.75 1 0.25 - 1 1 1 0.75 2 0.25 - 2 2 2 0.75 3 0.25 - 3 - - diff --git a/egs/timit/s4/local/timit_data_prep.sh b/egs/timit/s4/local/timit_data_prep.sh deleted file mode 100755 index 7636d6aee0d..00000000000 --- a/egs/timit/s4/local/timit_data_prep.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -set -o errexit - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function read_dirname () { - local dir_name=`expr "X$1" : '[^=]*=\(.*\)'`; - [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"; - local retval=`cd $dir_name 2>/dev/null && pwd || exit 1` - echo $retval -} - -PROG=`basename $0`; -usage="Usage: $PROG \n -Prepare train, dev, test file lists for TIMIT.\n\n -Required arguments:\n - --config-dir=DIR\tDirecory containing the necessary config files\n - --corpus-dir=DIR\tDirectory for the GlobalPhone corpus\n - --work-dir=DIR\t\tWorking directory\n -"; - -if [ $# -lt 3 ]; then - error_exit $usage; -fi - -while [ $# -gt 0 ]; -do - case "$1" in - --help) echo -e $usage; exit 0 ;; - --config-dir=*) - CONFDIR=`read_dirname $1`; shift ;; - --corpus-dir=*) - CORPUS=`read_dirname $1`; shift ;; - --work-dir=*) - WDIR=`read_dirname $1`; shift ;; - *) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - esac -done - -# (1) check if the config files are in place: -cd $CONFDIR -[ -f test_spk.list ] || error_exit "$PROG: Eval-set speaker list not found."; - -cd $WDIR -[ -f path.sh ] && . path.sh # Sets the PATH to contain necessary executables - -# (2) get the various file lists (for audio, transcription, etc.) -mkdir -p data/local -timit_prep_flists.sh --corpus-dir=$CORPUS --dev-spk=$CONFDIR/dev_spk.list \ - --test-spk=$CONFDIR/test_spk.list --work-dir=data - -# (3) Normalize the transcripts. -timit_norm_trans.pl -i data/local/train.trans -m $CONFDIR/phones.60-48-39.map \ - -to 48 > data/local/train.trans2; -for x in dev test; do - timit_norm_trans.pl -i data/local/${x}.trans -m $CONFDIR/phones.60-48-39.map \ - -to 39 > data/local/${x}.trans2; -done - -# Create the lexicon, which is just an identity mapping -cut -d' ' -f2- data/local/train.trans2 | tr ' ' '\n' | sort -u > data/local/p -paste data/local/p data/local/p > data/local/lexicon.txt - -# add disambig symbols to the lexicon: TODO: delete -ndisambig=`add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt` -ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence -echo $ndisambig > data/local/lex_ndisambig - -# Get the list of phones and map them to integers (adding the null symbol -# to the list). -cut -f2 data/local/lexicon.txt \ - | awk 'BEGIN{ print " 0"; } { printf("%s %d\n", $1, NR); }' \ - > data/local/phones.txt - -# Get the list of words: -cut -f1 data/local/lexicon.txt \ - | awk 'BEGIN{print " 0";} {printf("%s %d\n", $1, NR);} - END{printf("#0 %d\n", NR+1);}' > data/local/words.txt - -# (4) Create the phone bigram LM -( -if [ -z $IRSTLM ] ; then - export IRSTLM=$KALDI_ROOT/tools/irstlm/ -fi -export PATH=${PATH}:$IRSTLM/bin -if ! command -v prune-lm >/dev/null 2>&1 ; then - echo "$0: Error: the IRSTLM is not available or compiled" >&2 - echo "$0: Error: We used to install it by default, but." >&2 - echo "$0: Error: this is no longer the case." >&2 - echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2 - echo "$0: Error: and run extras/install_irstlm.sh" >&2 - exit 1 -fi - -cut -d' ' -f2- $srcdir/text | sed -e 's:^: :' -e 's:$: :' \ - > $srcdir/lm_train - -cut -d' ' -f2- data/local/train.trans2 | sed -e 's:^: :' -e 's:$: :' \ - > data/local/lm_train.txt - -build-lm.sh -i data/local/lm_train.txt -n 2 \ - -o data/local/lm_phone_bg.ilm.gz - -compile-lm data/local/lm_phone_bg.ilm.gz --text yes /dev/stdout \ - | grep -v unk | gzip -c > data/local/lm_phone_bg.arpa.gz - -) >& data/prepare_lm.log - -echo "Finished data preparation." 
diff --git a/egs/timit/s4/local/timit_format_data.sh b/egs/timit/s4/local/timit_format_data.sh deleted file mode 100755 index 5b8fa1c5169..00000000000 --- a/egs/timit/s4/local/timit_format_data.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -set -o errexit -set -o pipefail - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function read_dirname () { - local dir_name=`expr "X$1" : '[^=]*=\(.*\)'`; - [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"; - local retval=`cd $dir_name 2>/dev/null && pwd || exit 1` - echo $retval -} - -PROG=`basename $0`; -usage="Usage: $PROG \n -Prepare train, dev, test file lists.\n\n -Required arguments:\n - --hmm-proto=FILE\tPrototype of the HMM topology\n - --work-dir=DIR\t\tWorking directory\n -"; - -if [ $# -lt 2 ]; then - error_exit $usage; -fi - -while [ $# -gt 0 ]; -do - case "$1" in - --help) echo -e $usage; exit 0 ;; - --hmm-proto=*) - PROTO=`expr "X$1" : '[^=]*=\(.*\)'`; - [ -f $PROTO ] || error_exit "Cannot find HMM prototype file '$PROTO'"; - shift ;; - --work-dir=*) - WDIR=`read_dirname $1`; shift ;; - *) echo "Unknown argument: $1, exiting"; error_exit $usage ;; - esac -done - -cd $WDIR -. path.sh - -echo "Preparing train data" - -# (0) Create a directory to contain files needed in training: -for x in train dev test; do - mkdir -p data/$x - cp data/local/${x}_wav.scp data/$x/wav.scp - cp data/local/${x}.trans2 data/$x/text - cp data/local/${x}.spk2utt data/$x/spk2utt - cp data/local/${x}.utt2spk data/$x/utt2spk -done - -mkdir -p data/lang -cp data/local/phones.txt -t data/lang/ -cp data/local/words.txt -t data/lang/ - -# (1) Generate colon-separated lists of silence and non-silence phones -silphones="cl epi sil vcl"; -silphones.pl data/lang/phones.txt "$silphones" \ - data/lang/silphones.csl data/lang/nonsilphones.csl - -# (2) Create the L.fst without disambiguation symbols, for use in training. -make_lexicon_fst.pl data/local/lexicon.txt 0.5 sil \ - | fstcompile --isymbols=data/lang/phones.txt \ - --osymbols=data/lang/words.txt --keep_isymbols=false \ - --keep_osymbols=false \ - | fstarcsort --sort_type=olabel > data/lang/L.fst - -# (3) Create phonesets.txt and extra_questions.txt. 
-timit_make_questions.pl -i data/lang/phones.txt \ - -m data/lang/phonesets_mono.txt -r data/lang/roots.txt -grep -v sil data/lang/phonesets_mono.txt \ - > data/lang/phonesets_cluster.txt -echo "cl epi sil vcl" > data/lang/extra_questions.txt - -# (4), Finally, for training, create the HMM topology prototype: -silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'` -nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'` -sed -e "s:NONSILENCEPHONES:$nonsilphonelist:" \ - -e "s:SILENCEPHONES:$silphonelist:" $PROTO > data/lang/topo - -echo "Preparing test data" - -# (0) Copy over some files common to traina and test: -mkdir -p data/lang_test -for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl; do - cp data/lang/$f -t data/lang_test/ -done - -# (1) Create a list of phones including the disambiguation symbols. -# --include-zero includes the #0 symbol that is passed from G.fst -ndisambig=`cat data/local/lex_ndisambig`; -add_disambig.pl --include-zero data/lang_test/phones.txt $ndisambig \ - > data/lang_test/phones_disambig.txt -cp data/lang_test/phones_disambig.txt -t data/lang/ # for MMI. - -# (2) Create the lexicon FST with disambiguation symbols. There is an extra -# step where we create a loop to "pass through" the disambiguation symbols -# from G.fst. -phone_disambig_symbol=`grep \#0 data/lang_test/phones_disambig.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 data/lang_test/words.txt | awk '{print $2}'` - -make_lexicon_fst.pl data/local/lexicon_disambig.txt 0.5 sil '#'$ndisambig \ - | fstcompile --isymbols=data/lang_test/phones_disambig.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false \ - --keep_osymbols=false \ - | fstaddselfloops "echo $phone_disambig_symbol |" \ - "echo $word_disambig_symbol |" \ - | fstarcsort --sort_type=olabel > data/lang_test/L_disambig.fst - - # Needed for discriminative training -cp data/lang_test/L_disambig.fst -t data/lang/ - -# (3) Convert the language model to FST, and create decoding configuration. -timit_format_lms.sh data - -echo "Succeeded in formatting data." diff --git a/egs/timit/s4/local/timit_format_lms.sh b/egs/timit/s4/local/timit_format_lms.sh deleted file mode 100755 index c122515ff2c..00000000000 --- a/egs/timit/s4/local/timit_format_lms.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -set -o errexit -#set -o pipefail - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function read_dirname () { - [ -d "$1" ] || error_exit "Argument '$1' not a directory"; - local retval=`cd $1 2>/dev/null && pwd || exit 1` - echo $retval -} - -function format_lms () { - local lm_suffix=$1; - local work_dir=$2 - local test=$work_dir/lang_test_${lm_suffix} - - mkdir -p $test - for f in phones.txt words.txt phones_disambig.txt L.fst L_disambig.fst \ - silphones.csl nonsilphones.csl; do - cp $work_dir/lang_test/$f $test - done - - # Removing all "illegal" combinations of and , which are supposed to - # occur only at being/end of utt. These can cause determinization failures - # of CLG [ends up being epsilon cycles]. - gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \ - | egrep -v ' | | ' \ - | arpa2fst - | fstprint \ - | eps2disambig.pl | s2eps.pl \ - | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \ - --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon > $test/G.fst - set +e - fstisstochastic $test/G.fst - set -e -} - -PROG=`basename $0`; -usage="Usage: $PROG data_dir\n - Convert ARPA-format language models to FSTs.\n"; - -if [ $# -ne 1 ]; then - error_exit $usage; -fi -WDIR=`read_dirname $1`; - -# Next, for each type of language model, create the corresponding FST -# and the corresponding lang_test directory. - -echo "Preparing language models for test" -format_lms phone_bg $WDIR >& $WDIR/format_lms.log diff --git a/egs/timit/s4/local/timit_make_questions.pl b/egs/timit/s4/local/timit_make_questions.pl deleted file mode 100755 index a8b1355a63a..00000000000 --- a/egs/timit/s4/local/timit_make_questions.pl +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# 'phonesets_mono' contains sets of phones that are shared when building the -# monophone system and when asking questions based on an automatic clustering -# of phones, for the triphone system. -# 'roots' contain the information about which phones share a common root in -# the phonetic decision tree and which have distinct pdfs. It also states -# whether the tree-building should split the roots or not. 
- -my $usage = "Usage: timit_make_questions.pl -i phones -m phoneset_mono -r roots\ -Creates sharerd phonesets for monophone and context-dependent training.\ -Required arguments:\ - -i\tInput list of phones (can contain stress/position markers)\ - -m\tOutput shared phoneset for use in monophone training\ - -r\tOutput sharing and splitting info for context-dependent training\n"; - -use strict; -use Getopt::Long; -my ($in_phones, $mono, $roots, %phoneset); -GetOptions ("i=s" => \$in_phones, # Input list of phones - "m=s" => \$mono, # Shared phone-set for monophone system - "r=s" => \$roots ); # roots file for context-dependent systems - -die "$usage" unless(defined($in_phones) && defined($mono) && defined($roots)); - -open(P, "<$in_phones") or die "Cannot read from file '$in_phones': $!"; -open(MONO, ">$mono") or die "Cannot write to file '$mono': $!"; -open(ROOTS, ">$roots") or die "Cannot write to file '$roots': $!"; - -while (
<P>
) { - next if m/eps|sil|vcl|cl|epi/; - chomp; - m/^(\S+)(_.)?\s+\S+$/ or die "Bad line: $_\n"; - my $full_phone = defined($2)? $1.$2 : $1; - push @{$phoneset{$1}}, $full_phone; -} - -print MONO "cl epi sil vcl\n"; -print ROOTS "not-shared not-split cl epi sil vcl\n"; -foreach my $p (sort keys %phoneset) { - print MONO join(" ", @{$phoneset{$p}}), "\n"; - print ROOTS "shared split ", join(" ", @{$phoneset{$p}}), "\n"; -} diff --git a/egs/timit/s4/local/timit_norm_trans.pl b/egs/timit/s4/local/timit_norm_trans.pl deleted file mode 100755 index 07a185048d3..00000000000 --- a/egs/timit/s4/local/timit_norm_trans.pl +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script normalizes the TIMIT phonetic transcripts that have been -# extracted in a format where each line contains an utterance ID followed by -# the transcript, e.g.: -# fcke0_si1111 h# hh ah dx ux w iy dcl d ix f ay n ih q h# - -my $usage = "Usage: timit_norm_trans.pl -i transcript -m phone_map -from [60|48] -to [48|39] > normalized\n -Normalizes phonetic transcriptions for TIMIT, by mapping the phones to a -smaller set defined by the -m option. This script assumes that the mapping is -done in the \"standard\" fashion, i.e. to 48 or 39 phones. The input is -assumed to have 60 phones (+1 for glottal stop, which is deleted), but that can -be changed using the -from option. The input format is assumed to be utterance -ID followed by transcript on the same line.\n"; - -use strict; -use Getopt::Long; -die "$usage" unless(@ARGV >= 1); -my ($in_trans, $phone_map, $num_phones_out); -my $num_phones_in = 60; -GetOptions ("i=s" => \$in_trans, # Input transcription - "m=s" => \$phone_map, # File containing phone mappings - "from=i" => \$num_phones_in, # Input #phones: must be 60 or 48 - "to=i" => \$num_phones_out ); # Output #phones: must be 48 or 39 - -die $usage unless(defined($in_trans) && defined($phone_map) && - defined($num_phones_out)); -if ($num_phones_in != 60 && $num_phones_in != 48) { - die "Can only used 60 or 48 for -from (used $num_phones_in)." -} -if ($num_phones_out != 48 && $num_phones_out != 39) { - die "Can only used 48 or 39 for -to (used $num_phones_out)." -} -unless ($num_phones_out < $num_phones_in) { - die "Argument to -from ($num_phones_in) must be greater than that to -to ($num_phones_out)." -} - - -open(M, "<$phone_map") or die "Cannot open mappings file '$phone_map': $!"; -my (%phonemap, %seen_phones); -my $num_seen_phones = 0; -while () { - chomp; - next if ($_ =~ /^q\s*.*$/); # Ignore glottal stops. - m:^(\S+)\s+(\S+)\s+(\S+)$: or die "Bad line: $_"; - my $mapped_from = ($num_phones_in == 60)? $1 : $2; - my $mapped_to = ($num_phones_out == 48)? 
$2 : $3; - if (!defined($seen_phones{$mapped_to})) { - $seen_phones{$mapped_to} = 1; - $num_seen_phones += 1; - } - $phonemap{$mapped_from} = $mapped_to; -} -if ($num_seen_phones != $num_phones_out) { - die "Trying to map to $num_phones_out phones, but seen only $num_seen_phones"; -} - -open(T, "<$in_trans") or die "Cannot open transcription file '$in_trans': $!"; -while () { - chomp; - $_ =~ m:^(\S+)\s+(.+): or die "Bad line: $_"; - my $utt_id = $1; - my $trans = $2; - - $trans =~ s/q//g; # Remove glottal stops. - $trans =~ s/^\s*//; $trans =~ s/\s*$//; # Normalize spaces - - print $utt_id; - for my $phone (split(/\s+/, $trans)) { - print " $phonemap{$phone}" - } - print "\n"; -} diff --git a/egs/timit/s4/local/timit_prep_flists.sh b/egs/timit/s4/local/timit_prep_flists.sh deleted file mode 100755 index c7f969f6b6e..00000000000 --- a/egs/timit/s4/local/timit_prep_flists.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -set -o errexit -set -o pipefail - -function read_dirname () { - local dir_name=`expr "X$1" : '[^=]*=\(.*\)'`; - [ -d "$dir_name" ] || { echo "Argument '$dir_name' not a directory" >&2; \ - exit 1; } - local retval=`cd $dir_name 2>/dev/null && pwd || exit 1` - echo $retval -} - -PROG=`basename $0`; -usage="Usage: $PROG \n -Prepare train, dev, test file lists for TIMIT.\n\n -Required arguments:\n - --corpus-dir=DIR\tDirectory for the TIMIT corpus\n - --dev-spk=FILE\tDevelopment set speaker list\n - --test-spk=FILE\tCore test set speaker list\n - --work-dir=DIR\t\tPlace to write the files (in a subdirectory with the 2-letter language code)\n -"; - -if [ $# -lt 3 ]; then - echo -e $usage; exit 1; -fi - -while [ $# -gt 0 ]; -do - case "$1" in - --help) echo -e $usage; exit 0 ;; - --corpus-dir=*) - CORPUS=`read_dirname $1`; shift ;; - --dev-spk=*) - DEVSPK=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; - --test-spk=*) - TESTSPK=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; - --work-dir=*) - WDIR=`read_dirname $1`; shift ;; - *) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - esac -done - -if [ ! -d "$CORPUS/train" -a ! -d "$CORPUS/TRAIN" ]; then - echo "Expecting directory $CORPUS/train or $CORPUS/TRAIN to exist." - exit 1; -fi - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT - -# Get the list of speakers. The list of speakers in the 24-speaker core test -# set and the 50-speaker development set must be supplied to the script. All -# speakers in the 'train' directory are used for training. -tr '[:upper:]' '[:lower:]' < $DEVSPK > $tmpdir/dev_spk # Just in case! -tr '[:upper:]' '[:lower:]' < $TESTSPK > $tmpdir/test_spk # Just in case! - -ls -d "$CORPUS"/train/dr*/* | sed -e "s:^.*/::" > $tmpdir/train_spk - - -ODIR=$WDIR/local # Directory to write file lists & transcripts -mkdir -p $ODIR - -for x in train dev test; do - # First, find the list of audio files (use only si & sx utterances). 
- # Note: train & test sets are under different directories, but doing find on - # both and grepping for the speakers will work correctly. - find $CORPUS/{train,test} -not \( -name 'sa*' \) -name '*.wav' \ - | grep -f $tmpdir/${x}_spk > $ODIR/${x}_sph.flist - sed -e 's:.*/\(.*\)/\(.*\).wav$:\1_\2:' $ODIR/${x}_sph.flist \ - > $tmpdir/${x}_sph.uttids - paste $tmpdir/${x}_sph.uttids $ODIR/${x}_sph.flist \ - | sort -k1,1 > $ODIR/${x}_sph.scp - - # Now, get the transcripts: each line of the output contains an utterance - # ID followed by the transcript. - find $CORPUS/{train,test} -not \( -name 'sa*' \) -name '*.phn' \ - | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_phn.flist - sed -e 's:.*/\(.*\)/\(.*\).phn$:\1_\2:' $tmpdir/${x}_phn.flist \ - > $tmpdir/${x}_phn.uttids - while read line; do - [ -f $line ] || error_exit "Cannot find transcription file '$line'"; - cut -f3 -d' ' "$line" | tr '\n' ' ' | sed -e 's: *$:\n:' - done < $tmpdir/${x}_phn.flist > $tmpdir/${x}_phn.trans - paste $tmpdir/${x}_phn.uttids $tmpdir/${x}_phn.trans \ - | sort -k1,1 > $ODIR/${x}.trans - - # # Intersect the set of utterances with transcripts with the set of those - # # with valid audio. - # cut -f1 $tmpdir/${x}.trans \ - # | join $tmpdir/${x}_basenames_wav2 - > $tmpdir/${x}_basenames - # # Get the common set of WAV files and transcripts. - # join $tmpdir/${x}_basenames $tmpdir/${x}_wav.scp \ - # > $ODIR/${x}_wav.scp - # join $tmpdir/${x}_basenames $tmpdir/${x}.trans \ - # > $ODIR/${x}.trans - - awk '{printf("%s sph2pipe -f wav %s |\n", $1, $2);}' < $ODIR/${x}_sph.scp \ - > $ODIR/${x}_wav.scp - - sed -e 's:_.*$::' $tmpdir/${x}_sph.uttids \ - | paste -d' ' $tmpdir/${x}_sph.uttids - | sort -k1,1 \ - > $ODIR/${x}.utt2spk - utt2spk_to_spk2utt.pl $ODIR/${x}.utt2spk \ - > $ODIR/${x}.spk2utt; -done diff --git a/egs/timit/s4/path.sh b/egs/timit/s4/path.sh deleted file mode 100644 index 0167f6d038b..00000000000 --- a/egs/timit/s4/path.sh +++ /dev/null @@ -1,35 +0,0 @@ -# This contains the locations of the tools and data required for running -# the TIMIT experiments. - -# The KALDIROOT enviromnent variable must be set by the user. -# KALDIROOT=/absolute/path/to/kaldi/installation -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -KALDISRC=$KALDIROOT/src -KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin -KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin -KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/lm - -FSTBIN=$KALDIROOT/tools/openfst/bin -LMBIN=$KALDIROOT/tools/irstlm/bin - -[ -d $PWD/local ] || { echo "Expecting 'local' subdirectory"; exit 1; } -[ -d $PWD/utils ] || { echo "Expecting 'utils' subdirectory"; exit 1; } -[ -d $PWD/steps ] || { echo "Expecting 'steps' subdirectory"; exit 1; } - -LOCALUTILS=$PWD/local -KALDIUTILS=$PWD/utils -KALDISTEPS=$PWD/steps -SCRIPTS=$LOCALUTILS:$KALDIUTILS:$KALDISTEPS - -# If you already have shorten and sox on your path, comment the following out. -# Else use install.sh to install them first in the specified locations. -SPH2PIPE=$KALDIROOT/tools/sph2pipe_v2.5 -[ -x $SPH2PIPE/sph2pipe ] || { echo "Cannot find sph2pipe executable"; } -TOOLS=$SPH2PIPE - -export PATH=$PATH:$KALDIBIN:$FSTBIN:$LMBIN:$SCRIPTS:$TOOLS -export LC_ALL=C - -## Site-specific configs for Edinburgh -# [ `hostname -y` == ecdf ] && \ -# { . 
/etc/profile.d/modules.sh; module add intel/mkl; } diff --git a/egs/timit/s4/run.sh b/egs/timit/s4/run.sh deleted file mode 100755 index 7b6f25eedaa..00000000000 --- a/egs/timit/s4/run.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -exit 1; -# This script shows the steps needed to build a phone recognizer for TIMIT. - -# This recipe follows the setup first described in: -# K. F. Lee and H. W. Hon, "Speaker-independent phone recognition using hidden Markov models," 1988 -# where the training set is mapped to 48 phones and the results are presented -# on a 39-phone subset of that. - -# Set WORKDIR to someplace with enough disk space. That is where MFCCs will -# get created, as well as the LM in ARPA & FST formats. -WORKDIR=/path/with/disk/space -mkdir -p $WORKDIR -cp -r conf local utils steps path.sh $WORKDIR -cd $WORKDIR -. path.sh -[ -z "$KALDIROOT" ] && echo "ERROR: Must specify the KALDIROOT env varaible" && exit 1; - -local/timit_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/path/to/TIMIT --work-dir=$WORKDIR - -local/timit_format_data.sh --hmm-proto=conf/topo.proto --work-dir=$PWD - -# Now make MFCC features. -mfccdir=$WORKDIR/data/MFCC -for x in train dev test; do - steps/make_mfcc.sh --num-jobs 6 data/$x exp/make_mfcc/$x $mfccdir -done - -decode_cmd="qsub -q all.q@@blade -l ram_free=500M,mem_free=500M" -train_cmd="qsub -q all.q@@blade -l ram_free=200M,mem_free=200M" - -steps/train_mono.sh --num-jobs 10 --qcmd "$train_cmd" \ - data/train data/lang exp/mono -utils/mkgraph.sh --mono data/lang_test_phone_bg exp/mono exp/mono/graph_bg -steps/decode_deltas.sh --accwt 1.0 --beam 20.0 --latgen --num-jobs 6 \ - --qcmd "$decode_cmd" exp/mono/graph_bg data/dev exp/mono/decode_dev_bg -utils/score_lats.sh exp/mono/decode_dev_bg exp/mono/graph_bg/words.txt \ - data/dev conf/phones.60-48-39.map -opt_accwt=`grep WER exp/mono/decode_dev_bg/wer_* \ - | sed -e 's?.*wer_??' -e 's?:%WER??' -e 's?\[.*??' | sort -k2,2 -g \ - | head -1 | awk '{print 1/$1}'` -steps/decode_deltas.sh --accwt $opt_accwt --beam 20.0 --num-jobs 4 \ - --qcmd "$decode_cmd" exp/mono/graph_bg data/test exp/mono/decode_test_bg -utils/score_text.sh exp/mono/decode_test_bg exp/mono/graph_bg/words.txt \ - data/test conf/phones.60-48-39.map - -steps/align_deltas.sh --num-jobs 10 --qcmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali - -steps/train_deltas.sh --num-jobs 10 --qcmd "$train_cmd" \ - 2000 10000 data/train data/lang exp/mono_ali exp/tri1 - -utils/mkgraph.sh data/lang_test_phone_bg exp/tri1 exp/tri1/graph_bg -steps/decode_deltas.sh --accwt 1.0 --beam 20.0 --latgen --num-jobs 6 \ - --qcmd "$decode_cmd" exp/tri1/graph_bg data/dev exp/tri1/decode_dev_bg -utils/score_lats.sh exp/tri1/decode_dev_bg exp/tri1/graph_bg/words.txt \ - data/dev conf/phones.60-48-39.map -opt_accwt=`grep WER exp/tri1/decode_dev_bg/wer_* \ - | sed -e 's?.*wer_??' -e 's?:%WER??' -e 's?\[.*??' 
| sort -k2,2 -g \ - | head -1 | awk '{print 1/$1}'` -steps/decode_deltas.sh --accwt $opt_accwt --beam 20.0 --num-jobs 4 \ - --qcmd "$decode_cmd" exp/tri1/graph_bg data/test exp/tri1/decode_test_bg -utils/score_text.sh exp/tri1/decode_test_bg exp/tri1/graph_bg/words.txt \ - data/test conf/phones.60-48-39.map - diff --git a/egs/timit/s4/steps/align_deltas.sh b/egs/timit/s4/steps/align_deltas.sh deleted file mode 100755 index 89cba6192ae..00000000000 --- a/egs/timit/s4/steps/align_deltas.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/bin/bash - -# Copyright 2010-2012 Microsoft Corporation; Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. - -# This script does training-data alignment given a model built using -# CMN + delta + delta-delta features. It splits the data into -# four chunks and does everything in parallel on the same machine. -# Its output, all in its own experimental directory, is (assuming -# you don't change the #jobs with --num-job option), -# {0,1,2,3}.cmvn {0,1,2,3}.ali.gz, tree, final.mdl -# and final.occs (the last three are just copied from the source directory). - - -# Option to use precompiled graphs from last phase, if these -# are available (i.e. if they were built with the same data). -# These must be split into four pieces. - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -njobs=4 # Default number of jobs -qcmd="" # Options for the submit_jobs.sh script -oldgraphs=false - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG data/train data/lang exp/tri1 exp/tri1_ali\n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --use-graphs\tReuse older graphs\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd=" --qcmd=${1}"; shift ;; - --use-graphs) - oldgraphs=true; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as the data-dir - esac -done - -if [ $# != 4 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -data=$1 -lang=$2 -srcdir=$3 -dir=$4 - -if [ -f $lang/oov.txt ]; then - oov_opt="--map-oov '"`cat $lang/oov.txt`"'" -else - oov_opt='--ignore-oov' -fi - -mkdir -p $dir -# Create copy of the tree and model and occs... 
-cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1; - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" - -if [ ! -d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then - split_data.sh $data $njobs -fi - -echo "Computing cepstral mean and variance statistics" -# for n in `get_splits.pl $njobs`; do # Do this locally; it's fast. -submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/cmvnTASK_ID.log \ - compute-cmvn-stats --spk2utt=ark:$data/split$njobs/TASK_ID/spk2utt \ - scp:$data/split$njobs/TASK_ID/feats.scp ark:$dir/TASK_ID.cmvn \ - || error_exit "Computing CMN/CVN stats failed."; - - -# Align all training data using the supplied model. -echo "Aligning data from $data" -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - -if $oldgraphs; then - # for n in `get_splits.pl $njobs`; do - # feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - ls $srcdir/{1..$njobs}.fsts.gz >/dev/null \ - || error_exit "Missing FSTs with --use-graphs option specified." - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/alignTASK_ID.log \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/final.mdl \ - "ark:gunzip -c $srcdir/TASK_ID.fsts.gz|" "$feats" "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error doing alignment."; - -else - # for n in `get_splits.pl $njobs`; do - # feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - # compute integer form of transcripts. - tra="ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt $data/split$njobs/TASK_ID/text|"; - # We could just use gmm-align in the next line, but it's less efficient as - # it compiles the training graphs one by one. - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/alignTASK_ID.log \ - compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/final.mdl \ - ark:- "$feats" "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error doing alignment."; -fi - -echo "Done aligning data." diff --git a/egs/timit/s4/steps/decode_deltas.sh b/egs/timit/s4/steps/decode_deltas.sh deleted file mode 100755 index 5d5594fd981..00000000000 --- a/egs/timit/s4/steps/decode_deltas.sh +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# Decoding script that works with a GMM model and delta-delta plus -# cepstral mean subtraction features. Used, for example, to decode -# mono/ and tri1/ -# This script just generates lattices for a single broken-up -# piece of the data. 
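For orientation, the recipe's top-level run.sh (removed above) drives this decoding script roughly as follows; this is only a usage sketch, with the queue options and paths exactly as they appeared in that run.sh:

  decode_cmd="qsub -q all.q@@blade -l ram_free=500M,mem_free=500M"
  steps/decode_deltas.sh --accwt 1.0 --beam 20.0 --latgen --num-jobs 6 \
    --qcmd "$decode_cmd" exp/mono/graph_bg data/dev exp/mono/decode_dev_bg
  utils/score_lats.sh exp/mono/decode_dev_bg exp/mono/graph_bg/words.txt \
    data/dev conf/phones.60-48-39.map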
- -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readfloat () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - [[ "$retval" =~ ^-?[0-9]*\.*[0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not a real number." - echo $retval -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -accwt=1.0 -beam=30.0 -latgen=0 -njobs=4 -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG exp/mono/graph_bg data/dev exp/mono/decode_dev_bg\n\n -Options:\n - --help\t\tPrint this message and exit\n - --accwt FLOAT\tScaling for acoustic likelihoods (default=$accwt).\n - --beam FLOAT\tDecoder beam (default=$beam)\n - --latgen\tGenerate lattices (off by default)\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --accwt) - shift; accwt=`readfloat $1`; shift ;; - --beam) - shift; beam=`readfloat $1`; shift ;; - --latgen) shift; latgen=1 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd="--qcmd=${1}"; shift ;; - -*) error_exit "Unknown argument: $1, exiting\n$usage" ;; - *) break ;; # end of options: interpreted as the data-dir - esac -done - -if [ $# != 3 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -graphdir=$1 -data=$2 -dir=$3 -srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. - -mkdir -p $dir - -requirements="$data/feats.scp $srcdir/final.mdl $graphdir/HCLG.fst" -for f in $requirements; do - if [ ! -f $f ]; then - echo "decode_deltas.sh: no such file $f"; - exit 1; - fi -done - -# We only do one decoding pass, so there is no point caching the -# CMVN stats-- we make them part of a pipe. -feats="ark:compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:- scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" -if [ $njobs -gt 1 ]; then - if [ ! 
-d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then - split_data.sh $data $njobs - fi - mydata=$data/split$njobs/TASK_ID - feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | add-deltas ark:- ark:- |" -fi - -if [ $latgen -eq 1 ]; then - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/decode.TASK_ID.log \ - gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 \ - --acoustic-scale=$accwt --word-symbol-table=$graphdir/words.txt \ - $srcdir/final.mdl $graphdir/HCLG.fst "$feats" \ - "ark:|gzip -c > $dir/lat.TASK_ID.gz" || error_exit "Decoding failed."; -else - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/decode.TASK_ID.log \ - gmm-decode-faster --beam=$beam --acoustic-scale=$accwt \ - --word-symbol-table=$graphdir/words.txt $srcdir/final.mdl \ - $graphdir/HCLG.fst "$feats" ark,t:$dir/test.TASK_ID.tra \ - || error_exit "Decoding failed."; -fi diff --git a/egs/timit/s4/steps/make_mfcc.sh b/egs/timit/s4/steps/make_mfcc.sh deleted file mode 100755 index 7033f0a1a42..00000000000 --- a/egs/timit/s4/steps/make_mfcc.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. (one directory up from here) - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -njobs=4 # Default number of jobs -stage=-4 # Default starting stage (start with calculating CMN/CVN stats) -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd="--qcmd=${1}"; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as the data-dir - esac -done - -if [ $# != 3 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -data=$1 -logdir=$2 -mfccdir=$3 - -# use "name" as part of name of the archive. -name=`basename $data` - -mkdir -p $mfccdir || exit 1; -mkdir -p $logdir || exit 1; - -scp=$data/wav.scp -config=conf/mfcc.conf -required="$scp $config" - -for f in $required; do - if [ ! 
-f $f ]; then - echo "make_mfcc.sh: no such file $f" - exit 1; - fi -done - -# note: in general, the double-parenthesis construct in bash "((" is "C-style -# syntax" where we can get rid of the $ for variable names, and omit spaces. -# The "for" loop in this style is a special construct. - -split_scps="" -for ((n=1; n<=njobs; n++)); do - split_scps="$split_scps $logdir/wav$n.scp" -done - -split_scp.pl $scp $split_scps || exit 1; - -rm -f $logdir/.error.$name 2>/dev/null -submit_jobs.sh "$qcmd" --njobs=$njobs --log=$logdir/make_mfcc.TASK_ID.log \ - compute-mfcc-feats --verbose=2 --config=$config scp:$logdir/wavTASK_ID.scp \ - ark,scp:$mfccdir/mfcc_$name.TASK_ID.ark,$mfccdir/mfcc_$name.TASK_ID.scp \ - || error_exit "Error producing mfcc features for $name:"`tail $logdir/make_mfcc.*.log` - -# concatenate the .scp files together. -rm $data/feats.scp 2>/dev/null -for ((n=1; n<=njobs; n++)); do - cat $mfccdir/mfcc_$name.$n.scp >> $data/feats.scp -done - -# rm $logdir/wav*.scp - -echo "Succeeded creating MFCC features for $name" diff --git a/egs/timit/s4/steps/train_deltas.sh b/egs/timit/s4/steps/train_deltas.sh deleted file mode 100755 index 9101d89c9f4..00000000000 --- a/egs/timit/s4/steps/train_deltas.sh +++ /dev/null @@ -1,256 +0,0 @@ -#!/bin/bash - -# Copyright 2010-2012 Microsoft Corporation; Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. -# Triphone model training, using (e.g. MFCC) + delta + acceleration features and -# cepstral mean normalization. It starts from an existing directory (e.g. -# exp/mono), supplied as an argument, which is assumed to be built using the same -# type of features. -# -# This script starts from previously generated state-level alignments -# (in $alidir), e.g. generated by a previous monophone or triphone -# system. To build a context-dependent triphone system, we build -# decision trees that map a 3-phone phonetic context window to a -# pdf index. It's not really clear which is the right reference, but -# on is "Tree-based state tying for high accuracy acoustic modelling" -# by Steve Young et al. -# In a typical approach, there are decision trees for -# each monophone HMM-state (i.e. 3 per phone), and each one gets to -# ask questions about the left and right phone. These questions -# correspond to sets of phones, corresponding to phonetic classes -# (e.g. vowel, consonant, liquid, solar, ... ). In Kaldi, we prefer -# fully automatic algorithms, and anyway we're not sure where to get -# these types of lists, so we just generate the classes automatically. -# This is based on a top-down binary tree clustering of the phones -# (see "cluster-phones"), where we take single-Gaussian statistics for -# just the central state of each phone (assuming this to be more -# representative of the phones), and we get a tree structure on the -# phones; each class corresponds to a node of the tree (it contains all -# the phones that are children of that node). 
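The clustering described above is carried out further down in this same script; stripped of the submit_jobs.sh wrapper and logging, and with TASK_ID replaced by 1, the single-job sequence looks roughly like this (a sketch assembled from the stage -3 and stage -2 blocks below, not an independent implementation):

  # accumulate single-Gaussian stats for each seen (phone-in-context, HMM-state)
  acc-tree-stats --ci-phones=$silphonelist $alidir/final.mdl "$featspart" \
    "ark:gunzip -c $alidir/1.ali.gz|" $dir/1.treeacc
  sum-tree-stats $dir/treeacc $dir/*.treeacc
  # cluster the phones top-down to get the automatically generated questions
  cluster-phones $dir/treeacc $dir/phonesets.txt $dir/questions.txt
  compile-questions $lang/topo $dir/questions.txt $dir/questions.qst
  # grow the decision tree and initialize a model from its statistics
  build-tree --verbose=1 --max-leaves=$numleaves $dir/treeacc $dir/roots.txt \
    $dir/questions.qst $lang/topo $dir/tree
  gmm-init-model --write-occs=$dir/1.occs $dir/tree $dir/treeacc $lang/topo $dir/1.mdl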
Note: you could -# replace questions.txt with something derived from manually written -# questions. -# Also, the roots of the tree correspond to classes of phones (typically -# corresponding to "real phones", because the actual phones may contain -# word-begin/end and stress information), and the tree gets to ask -# questions also about the central phone, and about the state in the HMM. -# After building the tree, we do a number of iterations of Gaussian -# Mixture Model training; on selected iterations we redo the Viterbi -# alignments (initially, these are taken from the previous system). -# The Gaussian mixture splitting, whereby we go from a single Gaussian -# per state to multiple Gaussians, is done on all iterations (although -# we stop doing this a few iterations before the end). We don't have -# a fixed number of Gaussians per state, but we have an overall target -# #Gaussians that's specified on each iteration, and we allocate -# the Gaussians among states according to a power-law where the #Gaussians -# is proportional to the count to the power 0.2. The target -# increases linearly during training [note: logarithmically seems more -# natural but didn't work as well.] - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -njobs=4 # Default number of jobs -stage=-4 # Default starting stage (start with tree building) -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG 2000 10000 data/train_si84 data/lang exp/mono_ali exp/tri1\n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --stage INT\tStarting stage (e.g. -4 for tree building; 2 for iter 2; default=$stage)\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd=" --qcmd=${1}"; shift ;; - --stage) - shift; stage=`readint $1`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as num-leaves - esac -done - -if [ $# != 6 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -numleaves=$1 -totgauss=$2 -data=$3 -lang=$4 -alidir=$5 -dir=$6 - -if [ ! -f $alidir/final.mdl ]; then - echo "Error: alignment dir $alidir does not contain final.mdl" - exit 1; -fi - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -realign_iters="10 20 30"; -silphonelist=`cat $lang/silphones.csl` -numiters=35 # Number of iterations of training -maxiterinc=25 # Last iter to increase #Gauss on. -numgauss=$numleaves -incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss - -if [ -f $lang/oov.txt ]; then - oov_opt="--map-oov '"`cat $lang/oov.txt`"'" -else - oov_opt='--ignore-oov' -fi - -mkdir -p $dir/log -if [ ! 
-d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then - split_data.sh $data $njobs -fi - -# for n in `get_splits.pl $njobs`; do -featspart="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$njobs/TASK_ID/utt2spk ark:$alidir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - -if [ $stage -le -3 ]; then -# The next stage assumes we won't need the context of silence, which -# assumes something about $lang/roots.txt, but it seems pretty safe. - echo "Accumulating tree stats" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc_tree.TASK_ID.log \ - acc-tree-stats --ci-phones=$silphonelist $alidir/final.mdl "$featspart" \ - "ark:gunzip -c $alidir/TASK_ID.ali.gz|" $dir/TASK_ID.treeacc \ - || error_exit "Error accumulating tree stats"; - - sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log \ - || error_exit "Error summing tree stats."; - rm $dir/*.treeacc -fi - -if [ $stage -le -2 ]; then -# preparing questions, roots file... - echo "Computing questions for tree clustering" - ( sym2int.pl $lang/phones.txt $lang/phonesets_cluster.txt > $dir/phonesets.txt - cluster-phones $dir/treeacc $dir/phonesets.txt $dir/questions.txt \ - 2> $dir/log/questions.log - [ -f $lang/extra_questions.txt ] && \ - sym2int.pl $lang/phones.txt $lang/extra_questions.txt \ - >> $dir/questions.txt - compile-questions $lang/topo $dir/questions.txt $dir/questions.qst \ - 2>$dir/log/compile_questions.log - sym2int.pl --ignore-oov $lang/phones.txt $lang/roots.txt > $dir/roots.txt - ) || error_exit "Error in generating questions for tree clustering." - - echo "Building tree" - submit_jobs.sh "$qcmd" --log=$dir/log/train_tree.log \ - build-tree --verbose=1 --max-leaves=$numleaves $dir/treeacc $dir/roots.txt \ - $dir/questions.qst $lang/topo $dir/tree \ - || error_exit "Error in building tree."; - - gmm-init-model --write-occs=$dir/1.occs \ - $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log \ - || error_exit "Error in initializing the model."; - - gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \ - 2>$dir/log/mixup.log || error_exit "Error mixing up to $numgauss Gaussains"; - - rm $dir/treeacc -fi - - -if [ $stage -le -1 ]; then -# Convert alignments in $alidir, to use as initial alignments. -# This assumes that $alidir was split in $njobs pieces, just like the -# current dir. Just do this locally-- it's very fast. - echo "Converting old alignments" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh --njobs=$njobs --log=$dir/log/convertTASK_ID.log \ - convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \ - "ark:gunzip -c $alidir/TASK_ID.ali.gz|" \ - "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error converting old alignments."; -fi - -if [ $stage -le 0 ]; then -# Make training graphs (this is split in $njobs parts). 
- echo "Compiling training graphs" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/compile_graphsTASK_ID.log \ - compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \ - "ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt < $data/split$njobs/TASK_ID/text |" \ - "ark:|gzip -c >$dir/TASK_ID.fsts.gz" \ - || error_exit "Error compiling training graphs"; -fi - -x=1 -while [ $x -lt $numiters ]; do - echo Pass $x - if [ $stage -le $x ]; then - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.$x.TASK_ID.log \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/$x.mdl \ - "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \ - "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error aligning data on iteration $x"; - fi # Realign iters - - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc.$x.TASK_ID.log \ - gmm-acc-stats-ali $dir/$x.mdl "$featspart" \ - "ark,s,cs:gunzip -c $dir/TASK_ID.ali.gz|" $dir/$x.TASK_ID.acc \ - || error_exit "Error accumulating stats on iteration $x"; - - submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log \ - gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl \ - || error_exit "Error in pass $x extimation."; - rm -f r/$x.mdl $dir/$x.*.acc rm $dir/$x.occs - fi # Completed a training stage. - if [[ $x -le $maxiterinc ]]; then - numgauss=$[$numgauss+$incgauss]; - fi - x=$[$x+1]; -done - -( cd $dir; rm -f final.{mdl,occs}; ln -s $x.mdl final.mdl; \ - ln -s $x.occs final.occs; ) - -# Print out summary of the warning messages. -for x in $dir/log/*.log; do - n=`grep WARNING $x | wc -l`; - if [ $n -ne 0 ]; then echo $n warnings in $x; fi; -done - -echo Done diff --git a/egs/timit/s4/steps/train_mono.sh b/egs/timit/s4/steps/train_mono.sh deleted file mode 100755 index b7dad23d7fe..00000000000 --- a/egs/timit/s4/steps/train_mono.sh +++ /dev/null @@ -1,202 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. -# Flat start and monophone training, with delta-delta features. -# This script applies cepstral mean normalization (per speaker). - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." 
- echo $retval -} - -njobs=4 # Default number of jobs -stage=-4 # Default starting stage (start with calculating CMN/CVN stats) -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG data/train.1k data/lang exp/mono\n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --stage INT\tStarting stage (e.g. -4 for CMN/CVN stats; 2 for iter 2; default=$stage)\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd="--qcmd=${1}"; shift ;; - --stage) - shift; stage=`readint $1`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as the data-dir - esac -done - -if [ $# != 3 ]; then - error_exit $usage; -fi - -data=$1 -lang=$2 -dir=$3 - -[ -f path.sh ] && . path.sh - -# Configuration: -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -numiters=40 # Number of iterations of training -maxiterinc=30 # Last iter to increase #Gauss on. -numgauss=300 # Initial num-Gauss (must be more than #states=3*phones). -totgauss=1000 # Target #Gaussians. -incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss -realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38"; -if [ -f $lang/oov.txt ]; then - oov_opt="--map-oov '"`cat $lang/oov.txt`"'" -else - oov_opt='--ignore-oov' -fi - -mkdir -p $dir/log -if [ ! -d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then - split_data.sh $data $njobs -fi - -if [ $stage -le -3 ]; then - echo "Computing cepstral mean and variance statistics" - # for n in `get_splits.pl $njobs`; do # do this locally; it's fast. - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/cmvnTASK_ID.log \ - compute-cmvn-stats --spk2utt=ark:$data/split$njobs/TASK_ID/spk2utt \ - scp:$data/split$njobs/TASK_ID/feats.scp ark:$dir/TASK_ID.cmvn \ - || error_exit "Computing CMN/CVN stats failed."; -fi - -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk \"ark:cat $dir/*.cmvn|\" scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# for n in `get_splits.pl $njobs`; do -# for n in `seq 1 $njobs`; do -featspart="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$njobs/TASK_ID/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - - -if [ $stage -le -2 ]; then - echo "Initializing monophone system." - if [ -f $lang/phonesets_mono.txt ]; then - echo "Using shared phones from $lang/phonesets_mono.txt" - # In recipes with stress and position markers, this pools together - # the stats for the different versions of the same phone (also for - # the various silence phones). 
- sym2int.pl $lang/phones.txt $lang/phonesets_mono.txt > $dir/phonesets.int - shared_phones_opt="--shared-phones=$dir/phonesets.int" - fi - - gmm-init-mono $shared_phones_opt \ - "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo 39 \ - $dir/0.mdl $dir/tree 2> $dir/log/init.log \ - || error_exit "Monophone model initialization failed."; -fi - -if [ $stage -le -1 ]; then - echo "Compiling training graphs" - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/compile_graphsTASK_ID.log \ - compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \ - "ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt < $data/split$njobs/TASK_ID/text|" \ - "ark:|gzip -c >$dir/TASK_ID.fsts.gz" \ - || error_exit "Error compiling training graphs."; -fi - -if [ $stage -le 0 ]; then - echo "Aligning data equally (pass 0)" -# for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.0.TASK_ID.log \ - align-equal-compiled "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \ - ark,t,f:- \| \ - gmm-acc-stats-ali --binary=true $dir/0.mdl "$featspart" \ - ark:- $dir/0.TASK_ID.acc \ - || error_exit "Error in pass 0 accumulation"; - -# In the following steps, the --min-gaussian-occupancy=3 option is important, -# otherwise we cannot est "rare" phones and later on, they never align properly. - gmm-est --min-gaussian-occupancy=3 --mix-up=$numgauss \ - $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl \ - 2> $dir/log/update.0.log || error_exit "Error in pass 0 estimation."; - - rm $dir/0.*.acc -fi # Finished 0'th training iteration. - -beam=6 # will change to 10 below after 1st pass -x=1 -while [ $x -lt $numiters ]; do - echo "Pass $x" - if [ $stage -le $x ]; then - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.$x.TASK_ID.log \ - gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] \ - $dir/$x.mdl "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \ - "ark,t:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error in pass $x alignment."; - fi # Realign iters - - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc.$x.TASK_ID.log \ - gmm-acc-stats-ali $dir/$x.mdl "$featspart" \ - "ark:gunzip -c $dir/TASK_ID.ali.gz|" $dir/$x.TASK_ID.acc \ - || error_exit "Error in pass $x accumulation."; - - submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log \ - gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl \ - || error_exit "Error in pass $x extimation."; - rm -f $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs - fi # Completed a training stage. - if [ $x -le $maxiterinc ]; then - numgauss=$[$numgauss+$incgauss]; - fi - beam=10 - x=$[$x+1]; -done - -( cd $dir; rm -f final.{mdl,occs}; ln -s $x.mdl final.mdl; \ - ln -s $x.occs final.occs; ) - -# Print out summary of the warning messages. 
-for x in $dir/log/*.log; do - n=`grep WARNING $x | wc -l`; - if [ $n -ne 0 ]; then echo $n warnings in $x; fi; -done - -echo Done - -# example of showing the alignments: -# show-alignments data/lang/phones.txt $dir/30.mdl "ark:gunzip -c $dir/0.ali.gz|" | head -4 - diff --git a/egs/timit/s4/utils/add_disambig.pl b/egs/timit/s4/utils/add_disambig.pl deleted file mode 100755 index 962ef386763..00000000000 --- a/egs/timit/s4/utils/add_disambig.pl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds some specified number of disambig symbols to a symbol table. -# Adds these as #1, #2, etc. -# If the --include-zero option is specified, includes an extra one -# #0. - -$include_zero = 0; -if($ARGV[0] eq "--include-zero") { - $include_zero = 1; - shift @ARGV; -} - -if(@ARGV != 2) { - die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt "; -} - - -$input = $ARGV[0]; -$nsyms = $ARGV[1]; - -open(F, "<$input") || die "Opening file $input"; - -while() { - @A = split(" ", $_); - @A == 2 || die "Bad line $_"; - $lastsym = $A[1]; - print; -} - -if(!defined($lastsym)){ - die "Empty symbol file?"; -} - -if($include_zero) { - $lastsym++; - print "#0 $lastsym\n"; -} - -for($n = 1; $n <= $nsyms; $n++) { - $y = $n + $lastsym; - print "#$n $y\n"; -} diff --git a/egs/timit/s4/utils/add_lex_disambig.pl b/egs/timit/s4/utils/add_lex_disambig.pl deleted file mode 100755 index ded04bb4b49..00000000000 --- a/egs/timit/s4/utils/add_lex_disambig.pl +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. - -if(@ARGV != 2) { - die "Usage: add_lex_disambig.pl lexicon.txt lexicon_disambig.txt " -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. 
- $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -$max_disambig = 0; -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - $phnseq = join(" ",@A); - if(!defined $issubseq{$phnseq} - && $count{$phnseq}==1) { - ; # Do nothing. - } else { - if($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $curnumber = $disambig_of{$phnseq}; - if(!defined{$curnumber}) { $curnumber = 0; } - $curnumber++; # now 1 or 2, ... - while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols - if($curnumber > $max_disambig) { - $max_disambig = $curnumber; - } - $disambig_of{$phnseq} = $curnumber; - $phnseq = $phnseq . " #" . $curnumber; - } - } - print O "$word\t$phnseq\n"; -} - -print $max_disambig . "\n"; - diff --git a/egs/timit/s4/utils/decode.sh b/egs/timit/s4/utils/decode.sh deleted file mode 100755 index d8706cdca0a..00000000000 --- a/egs/timit/s4/utils/decode.sh +++ /dev/null @@ -1,145 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -function read_dirname () { - local dir_name=${1/#*=/}; # In case --switch=ARG format was used - [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"; - local retval=`cd $dir_name 2>/dev/null && pwd || exit 1` - echo $retval -} - -orig_args="$*" -njobs="" # Total number of jobs unset by default. Will set to #speakers (if - # using a grid) or 4 (if not), unless specified by user. 
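Referring back to add_lex_disambig.pl above: to make the assignment rule concrete, here is a hypothetical three-entry lexicon (words and phones invented purely for illustration) and the output the script would produce for it:

  input lexicon.txt:        output lexicon_disambig.txt:
  a      ey                 a      ey #1
  ate    ey t               ate    ey t #1
  eight  ey t               eight  ey t #2

"ey" receives #1 because it is a prefix of "ey t"; the two identical "ey t" pronunciations receive #1 and #2; and the script prints 2 (the highest disambiguation index used) on its standard output.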
-lang="" # Option for sclite scoring (off by default) -opts="" -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] [extra-args...]\n\n -Options:\n - --help\t\tPrint this message and exit\n - -l DIR\t\tDirectory to find L_align.fst (needed for sclite scoring)\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --opts STRING\tOptions for the decoder script\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - -l) - shift; lang=`read_dirname $1`; - [ ! -f "$lang/phones_disambig.txt" -o ! -f "$lang/L_align.fst" ] && \ - error_exit "Invalid argument to -l option; expected $lang/phones_disambig.txt and $lang/L_align.fst to exist." - shift ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --opts) - shift; opts="$1"; shift ;; - --qcmd) - shift; qcmd="--qcmd=${1}"; shift ;; - --stage) - shift; stage=`readint $1`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as the script to execute - esac -done - - -if [ $# -lt 4 ]; then - error_exit $usage; -fi - -script=$1 -graphdir=$2 -data=$3 -dir=$4 -# Make "dir" an absolute pathname. -dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` -mkdir -p $dir || exit 1 -shift;shift;shift;shift; -# Remaining args will be supplied to decoding script. -extra_args=$* - -[ -f path.sh ] && . path.sh - -for file in $script $scp $data/utt2spk; do - if [ ! -f "$file" ]; then - echo "decode.sh: no such file $file" - exit 1 - fi -done - -if [ ! -f $graphdir/HCLG.fst -a ! -f $graphdir/G.fst ]; then - # Note: most scripts expect HCLG.fst in graphdir, but the - # "*_fromlats.sh" script(s) require(s) a "lang" dir in that - # position - echo No such file: $graphdir/HCLG.fst or $graphdir/G.fst - exit 1; -fi - -if [ -z "$njobs" ]; then # Figure out num-jobs; user did not specify. - if [ -z "$qcmd" ]; then - njobs=4 - else # running on queue... - njobs=`utt2spk_to_spk2utt.pl $data/utt2spk | wc -l` - fi -fi - -echo "Decoding with num-jobs = $njobs" -if [[ $njobs -gt 1 || ! -d $data/split$njobs || \ - $data/split$njobs -ot $data/feats.scp ]]; then - split_data.sh $data $njobs -fi - -#for n in `get_splits.pl $njobs`; do -submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/partTASK_ID.log \ - $script $opts -j $njobs TASK_ID $graphdir $data $dir $extra_args \ - || error_exit "Error in decoding script: command was decode.sh $orig_args" - -if ls $dir/lat.*.gz >&/dev/null; then - if [ -n "$lang" ]; then - # sclite scoring: $lang directory supplied only for this reason. - [ ! -f $data/stm ] && \ - error_exit "Expected $data/stm to exist (-l only used for sclite scoring)" - score_lats_ctm.sh $dir $lang $data || \ - error_exit "Error in scoring of lattices using sclite." 
- else - score_lats.sh $dir $graphdir/words.txt $data || \ - error_exit "Error in scoring of latices."; - fi -elif ls $dir/*.txt >&/dev/null; then - score_text.sh $dir $data || error_exit "Error in scoring of hypotheses."; -else - eror_exit "No output found in $dir, not scoring."; -fi diff --git a/egs/timit/s4/utils/filter_scp.pl b/egs/timit/s4/utils/filter_scp.pl deleted file mode 100755 index 17483ae8b37..00000000000 --- a/egs/timit/s4/utils/filter_scp.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose first field is an utterance id), printing -# out only those lines whose first field is in id_list. - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl id_list [in.scp] > out.scp "; -} - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - if($seen{$A[0]}) { - print $_; - } -} diff --git a/egs/timit/s4/utils/int2sym.pl b/egs/timit/s4/utils/int2sym.pl deleted file mode 100755 index ad85ef34993..00000000000 --- a/egs/timit/s4/utils/int2sym.pl +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_noninteger = 0; -$ignore_first_field = 0; -$field = -1; -for($x = 0; $x < 2; $x++) { - if($ARGV[0] eq "--ignore-noninteger") { $ignore_noninteger = 1; shift @ARGV; } - if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } - if($ARGV[0] eq "--field") { - shift @ARGV; $field = $ARGV[0]+0; shift @ARGV; - if ($field < 1) { die "Bad argument to --field option: $field"; } - } -} - -if ($ignore_first_field && $field > 0) { die "Incompatible options ignore-first-field and field"; } -$zfield = $field-1; # Change to zero-based indexing. 
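As a quick illustration of these options (hypothetical symbol table and transcript, not taken from this recipe): with a words.txt containing entries such as "the 3", "cat 1", "sat 2", the call

  int2sym.pl --ignore-first-field words.txt 1.tra

turns an integer transcript line "utt1 3 1 2" into "utt1 the cat sat", leaving the utterance id untouched; score_lats.sh and score_text.sh below use exactly this --ignore-first-field form.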
- -$symtab = shift @ARGV; -if(!defined $symtab) { - die "Usage: sym2int.pl symtab [input] > output\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $int2sym{$A[1]} = $A[0]; -} - -sub int2sym { - my $a = shift @_; - my $pos = shift @_; - if($a !~ m:^\d+$:) { # not all digits.. - if($ignore_noninteger) { - print $a . " "; - next; - } else { - if($pos == 0) { - die "int2sym.pl: found noninteger token $a (try --ignore-first-field)\n"; - } else { - die "int2sym.pl: found noninteger token $a (try --ignore-noninteger if valid input)\n"; - } - } - } - $s = $int2sym{$a}; - if(!defined ($s)) { - die "int2sym.pl: integer $a not in symbol table $symtab."; - } - return $s; -} - -$error = 0; -while(<>) { - @A = split(" ", $_); - if($ignore_first_field) { - $key = shift @A; - print $key . " "; - } - if ($field != -1) { - if ($zfield <= $#A && $zfield >= 0) { - $a = $A[$zfield]; - $A[$zfield] = int2sym($a, $zfield); - } - print join(" ", @A); - } else { - for ($pos = 0; $pos <= $#A; $pos++) { - $a = $A[$pos]; - $s = int2sym($a, $pos); - print $s . " "; - } - } - print "\n"; -} - - - diff --git a/egs/timit/s4/utils/make_lexicon_fst.pl b/egs/timit/s4/utils/make_lexicon_fst.pl deleted file mode 100755 index 9e088889cc2..00000000000 --- a/egs/timit/s4/utils/make_lexicon_fst.pl +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST (no pron-probs involved). - -if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] lexiconfst.txt" -} - -$lexfn = shift @ARGV; -if(@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2){ - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - - -sub is_sil { - # Return true (1) if provided with a phone-sequence - # that means silence. - # @_ is the parameters of the function - # This function returns true if @_ equals ( $silphone ) - # or something of the form ( "#0", $silphone, "#1" ) - # where the "#0" and "#1" are disambiguation symbols. - return ( @_ == 1 && $_[0] eq $silphone || - (@_ == 3 && $_[1] eq $silphone && - $_[0] =~ m/^\#\d+$/ && - $_[0] =~ m/^\#\d+$/)); -} - -if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nexststate = 1; # next unallocated state. 
- while() { - @A = split(" ", $_); - $w = shift @A; - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if(@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps\n"; - $word_or_eps = ""; - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while() { - @A = split(" ", $_); - $w = shift @A; - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if(@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps\n"; - $word_or_eps = ""; - $s = $ns; - } else { - if(!is_sil(@A)){ - # This is non-deterministic but relatively compact, - # and avoids epsilons. - print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps\n"; - } - $word_or_eps = ""; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/egs/timit/s4/utils/mkgraph.sh b/egs/timit/s4/utils/mkgraph.sh deleted file mode 100755 index 971de31c782..00000000000 --- a/egs/timit/s4/utils/mkgraph.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script creates a fully expanded decoding graph (HCLG) that represents -# all the language-model, pronunciation dictionary (lexicon), context-dependency, -# and HMM structure in our model. The output is a Finite State Transducer -# that has word-ids on the output, and pdf-ids on the input (these are indexes -# that resolve to Gaussian Mixture Models). -# See -# http://kaldi.sourceforge.net/graph_recipe_test.html -# (this is compiled from this repository using Doxygen, -# the source for this part is in src/doc/graph_recipe_test.dox) - - -N=3 -P=1 -clean=false - -for x in 1 2 3; do - if [ $1 == "--mono" ]; then - N=1; - P=0; - shift; - fi - if [ $1 == "--clean" ]; then - clean=true - shift; - fi - -done - -if [ $# != 3 ]; then - echo "Usage: scripts/mkgraph.sh " - echo "e.g.: scripts/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph" - exit 1; -fi - -if [ -f path.sh ]; then . 
path.sh; fi - -lang=$1 -tree=$2/tree -model=$2/final.mdl -dir=$3 - -if $clean; then rm -r $lang/tmp; fi - -mkdir -p $dir - -tscale=1.0 -loopscale=0.1 - -# If $lang/tmp/LG.fst does not exist or is older than its sources, make it... -# (note: the [[ ]] brackets make the || type operators work (inside [ ], we -# would have to use -o instead), -f means file exists, and -ot means older than). - -required="$lang/L.fst $lang/G.fst $lang/phones_disambig.txt $lang/words.txt $lang/silphones.csl $model $tree" -for f in $required; do - [ ! -f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1; -done - -mkdir -p $lang/tmp -if [[ ! -f $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ - $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then - fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded > $lang/tmp/LG.fst || exit 1; - fstisstochastic $lang/tmp/LG.fst || echo "warning: LG not stochastic." -fi - -if [ ! -f $lang/phones_disambig.txt ]; then - echo "No such file $lang/phones_disambig.txt (supplied a training lang/ directory?)" - exit 1; -fi - -grep '#' $lang/phones_disambig.txt | awk '{print $2}' > $lang/tmp/disambig_phones.list - - -clg=$lang/tmp/CLG_${N}_${P}.fst - -if [[ ! -f $clg || $clg -ot $lang/tmp/LG.fst ]]; then - fstcomposecontext --context-size=$N --central-position=$P \ - --read-disambig-syms=$lang/tmp/disambig_phones.list \ - --write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.list \ - $lang/tmp/ilabels_${N}_${P} < $lang/tmp/LG.fst >$clg - fstisstochastic $clg || echo "warning: CLG not stochastic." -fi - -if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model \ - || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then - make-h-transducer --disambig-syms-out=$dir/disambig_tid.list \ - --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ - > $dir/Ha.fst || exit 1; -fi - -if [[ ! -f $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ - $dir/HCLGa.fst -ot $clg ]]; then - fsttablecompose $dir/Ha.fst $clg | fstdeterminizestar --use-log=true \ - | fstrmsymbols $dir/disambig_tid.list | fstrmepslocal | \ - fstminimizeencoded > $dir/HCLGa.fst || exit 1; - fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic" -fi - -if [[ ! -f $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then - add-self-loops --self-loop-scale=$loopscale --reorder=true \ - $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1; - - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. - fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic." - fi -fi - -# keep a copy of the lexicon and a list of silence phones with HCLG... -# this means we can decode without refrence to the $lang directory. -cp $lang/words.txt $dir/ -cp $lang/silphones.csl $dir/ - -# to make const fst: -# fstconvert --fst_type=const $dir/HCLG.fst $dir/HCLG_c.fst - -echo "Finished making decoding graphs in $dir" \ No newline at end of file diff --git a/egs/timit/s4/utils/s2eps.pl b/egs/timit/s4/utils/s2eps.pl deleted file mode 100755 index ffeeb8eb6af..00000000000 --- a/egs/timit/s4/utils/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/egs/timit/s4/utils/score_lats.sh b/egs/timit/s4/utils/score_lats.sh deleted file mode 100755 index e44eafa2ec4..00000000000 --- a/egs/timit/s4/utils/score_lats.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -if [ -f ./path.sh ]; then . ./path.sh; fi - -if [ $# -ne 4 ]; then - echo "Usage: score_lats.sh " - exit 1; -fi - -dir=$1 -symtab=$2 -data=$3 -phonemap=$4 - -if [ ! -f $symtab ]; then - echo No such word symbol table file $symtab - exit 1; -fi -if [ ! -f $data/text ]; then - echo Could not find transcriptions in $data/text - exit 1 -fi - - -trans=$data/text -cp $trans $dir/test.trans - -for inv_acwt in `seq 1 7`; do - acwt=`perl -e "print (1.0/$inv_acwt);"` - lattice-best-path --acoustic-scale=$acwt --word-symbol-table=$symtab \ - "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/${inv_acwt}.tra \ - 2>$dir/rescore_${inv_acwt}.log - - cat $dir/${inv_acwt}.tra \ - | int2sym.pl --ignore-first-field $symtab \ - | timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 | - compute-wer --text --mode=present ark:$dir/test.trans ark,p:- \ - >& $dir/wer_$inv_acwt -done - diff --git a/egs/timit/s4/utils/score_text.sh b/egs/timit/s4/utils/score_text.sh deleted file mode 100755 index 7d8942e4c35..00000000000 --- a/egs/timit/s4/utils/score_text.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -if [ -f ./path.sh ]; then . 
./path.sh; fi - -if [ $# -ne 4 ]; then - echo "Usage: score_text.sh " - exit 1; -fi - -dir=$1 -symtab=$2 -data=$3 -phonemap=$4 - -if [ ! -f $data/text ]; then - echo Could not find transcriptions in $data/text - exit 1 -fi - -trans=$data/text -sort -k1,1 $trans > $dir/test.trans - -# We assume the transcripts are already in integer form. -cat $dir/*.tra | sort -k1,1 \ - | int2sym.pl --ignore-first-field $symtab \ - | timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 \ - > $dir/text - -compute-wer --text --mode=present ark:$dir/test.trans ark,p:$dir/text \ - >& $dir/wer - -grep WER $dir/wer - diff --git a/egs/timit/s4/utils/silphones.pl b/egs/timit/s4/utils/silphones.pl deleted file mode 100755 index 3ff85dfe3bb..00000000000 --- a/egs/timit/s4/utils/silphones.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# creates integer lists of silence and non-silence phones in files, -# e.g. silphones.csl="1:2:3 \n" -# and nonsilphones.csl="4:5:6:7:...:24\n"; - -if(@ARGV != 4) { - die "Usage: silphones.pl phones.txt \"sil1 sil2 sil3\" silphones.csl nonsilphones.csl"; -} - -($symtab, $sillist, $silphones, $nonsilphones) = @ARGV; -open(S,"<$symtab") || die "Opening symbol table $symtab"; - - -foreach $s (split(" ", $sillist)) { - $issil{$s} = 1; -} - -@sil = (); -@nonsil = (); -while(){ - @A = split(" ", $_); - @A == 2 || die "Bad line $_ in phone-symbol-table file $symtab"; - ($sym, $int) = @A; - if($int != 0) { - if($issil{$sym}) { push @sil, $int; $seensil{$sym}=1; } - else { push @nonsil, $int; } - } -} - -foreach $k(keys %issil) { - if(!$seensil{$k}) { die "No such silence phone $k"; } -} -open(F, ">$silphones") || die "opening silphones file $silphones"; -open(G, ">$nonsilphones") || die "opening nonsilphones file $nonsilphones"; -print F join(":", @sil) . "\n"; -print G join(":", @nonsil) . "\n"; -close(F); -close(G); -if(@sil == 0) { print STDERR "Warning: silphones.pl no silence phones.\n" } -if(@nonsil == 0) { print STDERR "Warning: silphones.pl no non-silence phones.\n" } - diff --git a/egs/timit/s4/utils/split_data.sh b/egs/timit/s4/utils/split_data.sh deleted file mode 100755 index 19431aa5c6d..00000000000 --- a/egs/timit/s4/utils/split_data.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. 
-# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -set -o errexit - -if [ $# != 2 ]; then - echo "Usage: split_data.sh data-dir num-to-split" - exit 1 -fi - -data=$1 -numsplit=$2 - -if [ $numsplit -le 0 ]; then - echo "Invalid num-split argument $numsplit"; - exit 1; -fi - -n=0; -feats="" -wavs="" -utt2spks="" -texts="" - -nu=`cat $data/utt2spk | wc -l` -nf=`cat $data/feats.scp | wc -l` -nt=`cat $data/text | wc -l` -if [ $nu -ne $nf ]; then - echo "split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf);" - echo "this script may produce incorrectly split data." - echo "use utils/fix_data_dir.sh to fix this." -fi -if [ $nt -ne 0 -a $nu -ne $nt ]; then - echo "split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt);" - echo "this script may produce incorrectly split data." - echo "use utils/fix_data_dir.sh to fix this." -fi - -# utilsscripts/get_split.pl returns "0 1 2 3" or "00 01 .. 18 19" or whatever. -# for n in `get_splits.pl $numsplit`; do -for n in `seq 1 $numsplit`; do # Changed this to usual number sequence -Arnab - mkdir -p $data/split$numsplit/$n - feats="$feats $data/split$numsplit/$n/feats.scp" - wavs="$wavs $data/split$numsplit/$n/wav.scp" - texts="$texts $data/split$numsplit/$n/text" - utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk" -done - -split_scp.pl --utt2spk=$data/utt2spk $data/utt2spk $utt2spks -split_scp.pl --utt2spk=$data/utt2spk $data/feats.scp $feats -[ -f $data/wav.scp ] && \ - split_scp.pl --utt2spk=$data/utt2spk $data/wav.scp $wavs -[ -f $data/text ] && \ - split_scp.pl --utt2spk=$data/utt2spk $data/text $texts - -# for n in `get_splits.pl $numsplit`; do -for n in `seq 1 $numsplit`; do # Changed this to usual number sequence -Arnab - utt2spk_to_spk2utt.pl $data/split$numsplit/$n/utt2spk \ - > $data/split$numsplit/$n/spk2utt - # for completeness, also split the spk2gender file - [ -f $data/spk2gender ] && \ - filter_scp.pl $data/split$numsplit/$n/spk2utt $data/spk2gender \ - > $data/split$numsplit/$n/spk2gender -done - -exit 0 diff --git a/egs/timit/s4/utils/split_scp.pl b/egs/timit/s4/utils/split_scp.pl deleted file mode 100755 index f1054d323eb..00000000000 --- a/egs/timit/s4/utils/split_scp.pl +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# This program splits up any kind of .scp or archive-type file. -# If there is no utt2spk option it will work on any text file and -# will split it up with an approximately equal number of lines in -# each but. -# With the --utt2spk option it will work on anything that has the -# utterance-id as the first entry on each line; the utt2spk file is -# of the form "utterance speaker" (on each line). -# It splits it into equal size chunks as far as it can. 
If you use -# the utt2spk option it will make sure these chunks coincide with -# speaker boundaries. In this case, if there are more chunks -# than speakers (and in some other circumstances), some of the -# resulting chunks will be empty and it -# will print a warning. -# You will normally call this like: -# split_scp.pl scp scp.1 scp.2 scp.3 ... -# or -# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ... -# Note that you can use this script to split the utt2spk file itself, -# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ... - -# You can also call the scripts like: -# split_scp.pl -j 3 0 scp scp.0 -# [note: with this option, it assumes zero-based indexing of the split parts, -# i.e. the second number must be 0 <= n < num-jobs.] - -$num_jobs = 0; -$job_id = 0; -$utt2spk_file = ""; - -for ($x = 1; $x <= 2; $x++) { - if ($ARGV[0] eq "-j") { - shift @ARGV; - $num_jobs = shift @ARGV; - $job_id = shift @ARGV; - if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) { - die "Invalid num-jobs and job-id: $num_jobs and $job_id"; - } - } - if ($ARGV[0] =~ "--utt2spk=(.+)") { - $utt2spk_file=$1; - shift; - } -} - -if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) { - die "Usage: split_scp.pl [--utt2spk=] in.scp out1.scp out2.scp ... \n" . - " or: split_scp.pl -j num-jobs job-id [--utt2spk=] in.scp [out.scp]\n" . - " ... where 0 <= job-id < num-jobs."; -} - -$inscp = shift @ARGV; -if ($num_jobs == 0) { # without -j option - @OUTPUTS = @ARGV; -} else { - for ($j = 0; $j < $num_jobs; $j++) { - if ($j == $job_id) { - if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; } - else { push @OUTPUTS, "-"; } - } else { - push @OUTPUTS, "/dev/null"; - } - } -} - -if ($utt2spk_file ne "") { # We have the --utt2spk option... - open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file"; - while() { - @A = split; - @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file"; - ($u,$s) = @A; - $utt2spk{$u} = $s; - } - open(I, "<$inscp") || die "Opening input scp file $inscp"; - @spkrs = (); - while() { - @A = split; - if(@A == 0) { die "Empty or space-only line in scp file $inscp"; } - $u = $A[0]; - $s = $utt2spk{$u}; - if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } - if(!defined $spk_count{$s}) { - push @spkrs, $s; - $spk_count{$s} = 0; - $spk_data{$s} = ""; - } - $spk_count{$s}++; - $spk_data{$s} = $spk_data{$s} . $_; - } - # Now split as equally as possible .. - # First allocate spks to files by allocating an approximately - # equal number of speakers. - $numspks = @spkrs; # number of speakers. - $numscps = @OUTPUTS; # number of output files. - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scparray[$scpidx] = []; # [] is array reference. - } - for ($spkidx = 0; $spkidx < $numspks; $spkidx++) { - $scpidx = int(($spkidx*$numscps) / $numspks); - $spk = $spkrs[$spkidx]; - push @{$scparray[$scpidx]}, $spk; - $scpcount[$scpidx] += $spk_count{$spk}; - } - - # Now will try to reassign beginning + ending speakers - # to different scp's and see if it gets more balanced. - # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2. - # We can show that if considering changing just 2 scp's, we minimize - # this by minimizing the squared difference in sizes. This is - # equivalent to minimizing the absolute difference in sizes. This - # shows this method is bound to converge. 
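For reference, the algebra behind the convergence claim in the comment above: if two adjacent chunks hold n_1 and n_2 utterances with fixed total T = n_1 + n_2 and the target average is \bar{n}, then
(n_1 - \bar{n})^2 + (n_2 - \bar{n})^2 = \tfrac{1}{2}\bigl(T^2 + (n_1 - n_2)^2\bigr) - 2\bar{n}T + 2\bar{n}^2,
so with T and \bar{n} fixed, minimizing the objective over moves between those two chunks is the same as minimizing |n_1 - n_2|; each accepted move strictly decreases that non-negative integer, so the reassignment sweep must terminate.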
- - $changed = 1; - while($changed) { - $changed = 0; - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - # First try to reassign ending spk of this scp. - if($scpidx < $numscps-1) { - $sz = @{$scparray[$scpidx]}; - if($sz > 0) { - $spk = $scparray[$scpidx]->[$sz-1]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx]; - $nutt2 = $scpcount[$scpidx+1]; - if( abs( ($nutt2+$count) - ($nutt1-$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx+1] += $count; - $scpcount[$scpidx] -= $count; - pop @{$scparray[$scpidx]}; - unshift @{$scparray[$scpidx+1]}, $spk; - $changed = 1; - } - } - } - if($scpidx > 0 && @{$scparray[$scpidx]} > 0) { - $spk = $scparray[$scpidx]->[0]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx-1]; - $nutt2 = $scpcount[$scpidx]; - if( abs( ($nutt2-$count) - ($nutt1+$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx-1] += $count; - $scpcount[$scpidx] -= $count; - shift @{$scparray[$scpidx]}; - push @{$scparray[$scpidx-1]}, $spk; - $changed = 1; - } - } - } - } - # Now print out the files... - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scpfn = $OUTPUTS[$scpidx]; - open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing."; - $count = 0; - if(@{$scparray[$scpidx]} == 0) { - print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n"; - } else { - foreach $spk ( @{$scparray[$scpidx]} ) { - print F $spk_data{$spk}; - $count += $spk_count{$spk}; - } - if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } - } - close(F); - } -} else { - # This block is the "normal" case where there is no --utt2spk - # option and we just break into equal size chunks. - - open(I, "<$inscp") || die "Opening input scp file $inscp"; - - $numscps = @OUTPUTS; # size of array. - @F = (); - while() { - push @F, $_; - } - $numlines = @F; - if($numlines == 0) { - print STDERR "split_scp.pl: warning: empty input scp file $inscp"; - } - $linesperscp = int( ($numlines+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up. -# [just doing int() rounds down]. - for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) { - $scpfile = $OUTPUTS[$scpidx]; - open(O, ">$scpfile") || die "Opening output scp file $scpfile"; - for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) { - print O $F[$n]; - } - close(O) || die "Closing scp file $scpfile"; - } -} diff --git a/egs/timit/s4/utils/submit_jobs.sh b/egs/timit/s4/utils/submit_jobs.sh deleted file mode 100755 index 98e17c763fb..00000000000 --- a/egs/timit/s4/utils/submit_jobs.sh +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -set -o errexit - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readposint () { - local retval=`expr "X$1" : '[^=]*=\(.*\)'`; - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not a positive integer." - echo $retval -} - -PROG=`basename $0`; -usage="Usage: $PROG [options] --log=logfile command\n -Runs the supplied command and redirect the stdout & stderr to logfile.\n -With the --qcmd option, the command is submitted to a grid engine.\n -Any 'TASK_ID' in logfile or command is replaced with job number or \$SGE_TASK_ID (for SGE).\n\n -Required arguments:\n - --log=FILE\tOutput of command redirected to this file.\n\n -Options:\n - --njobs=INT\tNumber of jobs to run (default=1). Assumes split data exists.\n - --qcmd=STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n -"; - -if [ $# -lt 2 ]; then - error_exit $usage; -fi - -NJOBS=1 # Default number of jobs -QCMD="" # No grid usage by default -while [ $# -gt 1 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --qcmd=*) - QCMD=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; - --njobs=*) - NJOBS=`readposint $1`; shift ;; - --log=*) - LOGF=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - '') shift ;; # Handle any empty arguments - *) break ;; # interpreted as the command to execute - esac -done - -logfile_base=`basename $LOGF .log` -logfile_dir=`dirname $LOGF` -mkdir -p $logfile_dir; - -# Now, parse the command to execute -exec_cmd=""; -while [ $# -gt 0 ]; do - case "$1" in - *\"*) exec_cmd=$exec_cmd"'''$1''' "; shift ;; - *\ *) exec_cmd=$exec_cmd"\"$1\" "; shift ;; - *) exec_cmd=$exec_cmd"$1 "; shift ;; - esac -done - -function run_locally { - rm -f $logfile_dir/.error; - for n in `seq 1 $NJOBS`; do - local this_logfile=${logfile_base//TASK_ID/$n} - this_logfile=$logfile_dir"/"$this_logfile".log" - local this_command=${exec_cmd//TASK_ID/$n} - ( echo -e "# Command:\n# $this_command"; - echo "# Running on: "`hostname`; - echo "# Started at: "`date`; - eval $this_command || touch $logfile_dir/.error - echo "# Finished at: "`date` ) >> $this_logfile 2>&1 & - done - wait; - [ -f $logfile_dir/.error ] && { rm -f $logfile_dir/.error; \ - error_exit "One (or more) locally run jobs failed."; } - exit 0; -} - -function run_on_grid { - local this_logfile=${logfile_base//TASK_ID/\$SGE_TASK_ID} - this_logfile=$logfile_dir"/"$this_logfile".log" - # If log files are in a separate 'log' directory, create the job submission - # scripts one level up. - local qdir=${logfile_dir/%log/q} - mkdir -p $qdir - local qlog=$qdir/queue.log - local this_command=${exec_cmd//TASK_ID/\$SGE_TASK_ID} - local run_this=$qdir"/"${logfile_base//TASK_ID/}".sh" - run_this=${run_this//../.} - printf "#!/bin/bash\n#\$ -S /bin/bash\n#\$ -V -cwd -j y\n" > $run_this - { printf "set -e\n"; - printf "{ cd %s\n . path.sh\n echo Running on: \`hostname\`\n" "$PWD"; - printf " echo Started at: \`date\`\n $this_command\n ret=\$\?\n"; - printf " echo Finished at: \`date\`\n} >& %s\nexit \$ret\n" "$this_logfile" - printf "# Submitted with:\n" - printf "# $QCMD -sync y -o $qlog -t 1-$NJOBS $run_this >> $qlog 2>&1\n" - } >> $run_this - $QCMD -sync y -o $qlog -t 1-${NJOBS} $run_this >> $qlog 2>&1 - exit $? 
-} - -if [ -z "$QCMD" ]; then - run_locally; -else - run_on_grid; -fi - diff --git a/egs/timit/s4/utils/sym2int.pl b/egs/timit/s4/utils/sym2int.pl deleted file mode 100755 index 71492652c50..00000000000 --- a/egs/timit/s4/utils/sym2int.pl +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; -$ignore_first_field = 0; -for($x = 0; $x < 3; $x++) { - # Note: it will just print OOVS unmodified if you specify --ignore-oov. - # Else will complain and put nothing out. - if($ARGV[0] eq "--ignore-oov") { $ignore_oov = 1; shift @ARGV; } - if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } - if($ARGV[0] eq "--map-oov") { shift @ARGV; $map_oov = shift @ARGV; } -} - -$symtab = shift @ARGV; -if(!defined $symtab) { - die "Usage: sym2int.pl symtab [input transcriptions] > output transcriptions\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -$num_warning = 0; -$max_warning = 20; -$error = 0; -while(<>) { - @A = split(" ", $_); - if(@A == 0) { - die "Empty line in transcriptions input."; - } - if($ignore_first_field) { - $key = shift @A; - print $key . " "; - } - @B = (); - foreach $a (@A) { - $i = $sym2int{$a}; - if(!defined ($i)) { - if (defined $map_oov) { - if (!defined $sym2int{$map_oov}) { - die "sym2int.pl: invalid map-oov option $map_oov (symbol not defined in $symtab)"; - } - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $sym2int{$map_oov}; - } elsif($ignore_oov) { - $i = $a; # just print them out unmodified.. - } else { - die "sym2int.pl: undefined symbol $a\n"; - } - } - push @B, $i; - } - print join(" ", @B); - print "\n"; -} - -if($error) { exit(1); } -else { exit(0); } - - - diff --git a/egs/timit/s4/utils/utt2spk_to_spk2utt.pl b/egs/timit/s4/utils/utt2spk_to_spk2utt.pl deleted file mode 100755 index 0dfb7ba5fd3..00000000000 --- a/egs/timit/s4/utils/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - $uttlist{$s} = $uttlist{$s} . "$u "; -} -foreach $s (@spklist) { - $l = $uttlist{$s}; - $l =~ s: $::; # remove trailing space. - print "$s $l\n"; -} diff --git a/egs/timit/s5/cmd.sh b/egs/timit/s5/cmd.sh index fd91a53ff73..5abbfd4495a 100644 --- a/egs/timit/s5/cmd.sh +++ b/egs/timit/s5/cmd.sh @@ -1,36 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -#export cuda_cmd=run.pl +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still sometimes used in nnet1 +# example scripts. +export cuda_cmd="queue.pl --gpu 1" - -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then - export train_cmd="queue.pl -l arch=*64*" - export decode_cmd="queue.pl -l arch=*64* --mem 3G" - export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" - export cuda_cmd="queue.pl -l gpu=1" -elif [[ $(hostname -f) == *.fit.vutbr.cz ]]; then +# the rest of this file is present for historical reasons. +# for cluster-specific configuration it's better to rely on conf/queue.conf. +if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then #b) BUT cluster options - queue="all.q@@blade,all.q@@speech,all.q@dellgpu*,all.q@supergpu*" - export train_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,matylda5=0.5" - export decode_cmd="queue.pl -q $queue -l ram_free=3000M,mem_free=3000M,matylda5=0.1" - export mkgraph_cmd="queue.pl -q $queue -l ram_free=4G,mem_free=4G,matylda5=3" - export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu1,long.q@pcgpu*,long.q@supergpu1 -l gpu=1" -else - echo "$0: you need to define options for your cluster." 
- exit 1; + queue="all.q@@blade,all.q@@speech" + gpu_queue="long.q@@gpu" + storage="matylda5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.5" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/timit/s5/local/score_basic.sh b/egs/timit/s5/local/score_basic.sh index 102f2028635..2dbffe38e80 100755 --- a/egs/timit/s5/local/score_basic.sh +++ b/egs/timit/s5/local/score_basic.sh @@ -55,6 +55,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score_basic.LMWT.log \ utils/int2sym.pl -f 2- $symtab \| \ local/timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 \| \ compute-wer --text --mode=all \ - ark:$dir/scoring/test_filt.txt ark,p:- $dir/scoring/wer_stats_LMWT ">&" $dir/wer_LMWT || exit 1; + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; exit 0; diff --git a/egs/timit/s5/local/timit_format_data.sh b/egs/timit/s5/local/timit_format_data.sh index 019d74dcfc7..4e8816a6799 100755 --- a/egs/timit/s5/local/timit_format_data.sh +++ b/egs/timit/s5/local/timit_format_data.sh @@ -16,7 +16,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/dict/lexicon.txt mkdir -p $tmpdir -for x in train dev test; do +for x in train dev test; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.text data/$x/text || exit 1; @@ -37,13 +37,10 @@ for lm_suffix in bg; do test=data/lang_test_${lm_suffix} mkdir -p $test cp -r data/lang/* $test - + gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \ - egrep -v ' | | ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -61,7 +58,7 @@ for lm_suffix in bg; do < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g done diff --git a/egs/timit/s5/path.sh b/egs/timit/s5/path.sh index 1e48f21b323..62794699b41 100755 --- a/egs/timit/s5/path.sh +++ b/egs/timit/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/voxforge/online_demo/run.sh b/egs/voxforge/online_demo/run.sh index 938061ebb4f..6a7e89991b6 100755 --- a/egs/voxforge/online_demo/run.sh +++ b/egs/voxforge/online_demo/run.sh @@ -3,6 +3,8 @@ # Copyright 2012 Vassil Panayotov # Apache 2.0 +# Note: you have to do 'make ext' in ../../../src/ before running this. + # Set the paths to the binaries and scripts needed KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/../s5/utils/:$KALDI_ROOT/src/onlinebin:$KALDI_ROOT/src/bin:$PATH @@ -29,7 +31,7 @@ audio=${data_file}/audio if [ ! -s ${data_file}.tar.bz2 ]; then echo "Downloading test models and data ..." wget -T 10 -t 3 $data_url; - + if [ ! -s ${data_file}.tar.bz2 ]; then echo "Download of $data_file has failed!" exit 1 @@ -53,11 +55,11 @@ case $test_mode in echo " estimated on an audio book's text. The text in question is" echo " \"King Solomon's Mines\" (http://www.gutenberg.org/ebooks/2166)." echo " You may want to read some sentences from this book first ..." - echo + echo online-gmm-decode-faster --rt-min=0.5 --rt-max=0.7 --max-active=4000 \ --beam=12.0 --acoustic-scale=0.0769 $ac_model/model $ac_model/HCLG.fst \ $ac_model/words.txt '1:2:3:4:5' $trans_matrix;; - + simulated) echo echo -e " SIMULATED ONLINE DECODING - pre-recorded audio is used\n" @@ -70,7 +72,7 @@ case $test_mode in echo " NOTE: Using utterances from the book, on which the LM was estimated" echo " is considered to be \"cheating\" and we are doing this only for" echo " the purposes of the demo." - echo + echo echo " You can type \"./run.sh --test-mode live\" to try it using your" echo " own voice!" echo @@ -87,7 +89,7 @@ case $test_mode in scp:$decode_dir/input.scp $ac_model/model $ac_model/HCLG.fst \ $ac_model/words.txt '1:2:3:4:5' ark,t:$decode_dir/trans.txt \ ark,t:$decode_dir/ali.txt $trans_matrix;; - + *) echo "Invalid test mode! Should be either \"live\" or \"simulated\"!"; exit 1;; @@ -97,7 +99,7 @@ esac if [ $test_mode == "simulated" ]; then # Convert the reference transcripts from symbols to word IDs sym2int.pl -f 2- $ac_model/words.txt < $audio/trans.txt > $decode_dir/ref.txt - + # Compact the hypotheses belonging to the same test utterance cat $decode_dir/trans.txt |\ sed -e 's/^\(test[0-9]\+\)\([^ ]\+\)\(.*\)/\1 \3/' |\ diff --git a/egs/voxforge/s5/cmd.sh b/egs/voxforge/s5/cmd.sh index 2d454050669..71dd849a93b 100644 --- a/egs/voxforge/s5/cmd.sh +++ b/egs/voxforge/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/voxforge/s5/local/voxforge_format_data.sh b/egs/voxforge/s5/local/voxforge_format_data.sh index 910be33bd75..6abaf6c7656 100755 --- a/egs/voxforge/s5/local/voxforge_format_data.sh +++ b/egs/voxforge/s5/local/voxforge_format_data.sh @@ -12,7 +12,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/dict/lexicon.txt mkdir -p $tmpdir -for x in train test; do +for x in train test; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/${x}_trans.txt data/$x/text || exit 1; @@ -33,22 +33,8 @@ for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones; do cp -r data/lang/$f $test done cat $lmdir/lm.arpa | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs.txt - -# grep -v ' ' because the LM seems to have some strange and useless -# stuff in it with multiple 's in the history. Encountered some other similar -# things in a LM from Geoff. Removing all "illegal" combinations of and , -# which are supposed to occur only at being/end of utt. These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -cat $lmdir/lm.arpa | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -67,9 +53,8 @@ awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print " fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \ $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst -fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && +fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -rf $tmpdir echo "*** Succeeded in formatting data." - diff --git a/egs/voxforge/s5/path.sh b/egs/voxforge/s5/path.sh index 6740f11d675..d5ee6268bae 100755 --- a/egs/voxforge/s5/path.sh +++ b/egs/voxforge/s5/path.sh @@ -1,5 +1,8 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh # VoxForge data will be stored in: export DATA_ROOT="/home/dpovey/kaldi-clean/egs/voxforge/s5/voxforge" # e.g. something like /media/secondary/voxforge diff --git a/egs/vystadial_cz/s5/cmd.sh b/egs/vystadial_cz/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_cz/s5/cmd.sh +++ b/egs/vystadial_cz/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/vystadial_cz/s5/local/create_G.sh b/egs/vystadial_cz/s5/local/create_G.sh index 7be19f7f03f..b462b9eab01 100755 --- a/egs/vystadial_cz/s5/local/create_G.sh +++ b/egs/vystadial_cz/s5/local/create_G.sh @@ -17,7 +17,7 @@ for lm in $LMs ; do lmp=$lmdir/`basename $lm` tmpdir=$tgt/tmp - mkdir -p $tgt + mkdir -p $tgt mkdir -p $tmpdir echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..." @@ -26,21 +26,9 @@ for lm in $LMs ; do ln -s $langdir/$f $tgt/$f 2> /dev/null done - cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. 
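For context, the deleted pipeline just below (the same block is removed from the voxforge and vystadial_en recipes) filtered ill-formed sentence-boundary n-grams (combinations of <s> and </s>, e.g. "<s> <s>" or "</s> </s>") out of the ARPA LM before compiling it into G.fst. A rough sketch of that older style, with placeholder file names, is:

  cat lm.arpa | \
    grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
    arpa2fst - | fstprint | \
    utils/remove_oovs.pl oovs.txt | \
    utils/eps2disambig.pl | utils/s2eps.pl | \
    fstcompile --isymbols=words.txt --osymbols=words.txt \
      --keep_isymbols=false --keep_osymbols=false | \
    fstrmepsilon | fstarcsort --sort_type=ilabel > G.fst

The replacement introduced by this patch collapses all of that into a single call of the form "arpa2fst --disambig-symbol=#0 --read-symbol-table=words.txt".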
- cat $lmp | \ - grep -v ' \| \| ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \ - --osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$tgt/words.txt - $tgt/G.fst fstisstochastic $tgt/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -48,7 +36,7 @@ for lm in $LMs ; do # nonzero because the backoff weights make the states sum to >1). # Because of the fiasco for these particular LMs, the first number is not # as close to zero as it could be. - + # Everything below is only for diagnostic. # Checking that G has no cycles with empty words on them (e.g. , ); # this might cause determinization failure of CLG. @@ -59,7 +47,7 @@ for lm in $LMs ; do fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \ $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 # rm -rf $tmpdir # TODO debugging diff --git a/egs/vystadial_cz/s5/path.sh b/egs/vystadial_cz/s5/path.sh index 98bd2fab462..4fa5bb91042 100755 --- a/egs/vystadial_cz/s5/path.sh +++ b/egs/vystadial_cz/s5/path.sh @@ -1,9 +1,12 @@ # Needed for "correct" sorting +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils:$PWD/steps:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C -export KALDI_ROOT=../../.. # adding Kaldi binaries to path -export PATH=$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PWD/utils:$PWD/steps:$PATH diff --git a/egs/vystadial_en/s5/cmd.sh b/egs/vystadial_en/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_en/s5/cmd.sh +++ b/egs/vystadial_en/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/vystadial_en/s5/local/create_G.sh b/egs/vystadial_en/s5/local/create_G.sh index 7be19f7f03f..b462b9eab01 100755 --- a/egs/vystadial_en/s5/local/create_G.sh +++ b/egs/vystadial_en/s5/local/create_G.sh @@ -17,7 +17,7 @@ for lm in $LMs ; do lmp=$lmdir/`basename $lm` tmpdir=$tgt/tmp - mkdir -p $tgt + mkdir -p $tgt mkdir -p $tmpdir echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..." @@ -26,21 +26,9 @@ for lm in $LMs ; do ln -s $langdir/$f $tgt/$f 2> /dev/null done - cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - cat $lmp | \ - grep -v ' \| \| ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \ - --osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$tgt/words.txt - $tgt/G.fst fstisstochastic $tgt/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -48,7 +36,7 @@ for lm in $LMs ; do # nonzero because the backoff weights make the states sum to >1). # Because of the fiasco for these particular LMs, the first number is not # as close to zero as it could be. - + # Everything below is only for diagnostic. # Checking that G has no cycles with empty words on them (e.g. , ); # this might cause determinization failure of CLG. @@ -59,7 +47,7 @@ for lm in $LMs ; do fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \ $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 # rm -rf $tmpdir # TODO debugging diff --git a/egs/vystadial_en/s5/path.sh b/egs/vystadial_en/s5/path.sh index d34cd4cbe5e..d864305627b 100755 --- a/egs/vystadial_en/s5/path.sh +++ b/egs/vystadial_en/s5/path.sh @@ -1,9 +1,10 @@ -# Needed for "correct" sorting +export KALDI_ROOT=`pwd`/../../.. 
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PWD/utils:$PWD/steps:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C -export KALDI_ROOT=../../.. -# adding Kaldi binaries to path -export PATH=$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PWD/utils:$PWD/steps:$PATH srilm_bin=$KALDI_ROOT/tools/srilm/bin/ diff --git a/egs/wsj/s5/RESULTS b/egs/wsj/s5/RESULTS index 2bb1d2124d0..acff4f9d7fe 100644 --- a/egs/wsj/s5/RESULTS +++ b/egs/wsj/s5/RESULTS @@ -89,22 +89,6 @@ exit 0 %WER 5.74 [ 324 / 5643, 46 ins, 41 del, 237 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg/wer_19 %WER 5.90 [ 333 / 5643, 46 ins, 39 del, 248 sub ] exp/tri3b/decode_bd_tgpr_eval92_tg/wer_18 -# this section demonstrates RNNLM-HS rescoring (commented out by default) -# the exact results might differ insignificantly due to hogwild in RNNLM-HS training that introduces indeterminism -%WER 5.92 [ 334 / 5643, 58 ins, 32 del, 244 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg/wer_14 # baseline (no rescoring) -%WER 5.26 [ 297 / 5643, 47 ins, 29 del, 221 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs100_0.3/wer_15 -%WER 5.17 [ 292 / 5643, 46 ins, 30 del, 216 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs300_0.3/wer_16 -%WER 5.64 [ 318 / 5643, 50 ins, 34 del, 234 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs30_0.15/wer_16 -%WER 5.55 [ 313 / 5643, 51 ins, 32 del, 230 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.15/wer_16 -%WER 5.55 [ 313 / 5643, 51 ins, 32 del, 230 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.15_N1000/wer_16 -%WER 5.39 [ 304 / 5643, 50 ins, 30 del, 224 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.3/wer_15 -%WER 5.42 [ 306 / 5643, 50 ins, 30 del, 226 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.3_N10/wer_15 -%WER 5.39 [ 304 / 5643, 50 ins, 30 del, 224 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.3_N1000/wer_15 -%WER 5.37 [ 303 / 5643, 49 ins, 29 del, 225 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.4/wer_14 -%WER 5.37 [ 303 / 5643, 49 ins, 29 del, 225 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.4_N1000/wer_14 -%WER 5.26 [ 297 / 5643, 45 ins, 32 del, 220 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.5_N1000/wer_15 -%WER 5.14 [ 290 / 5643, 43 ins, 32 del, 215 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.75_N1000/wer_18 - %WER 14.17 [ 1167 / 8234, 222 ins, 123 del, 822 sub ] exp/tri3b/decode_tgpr_dev93/wer_17 %WER 19.37 [ 1595 / 8234, 315 ins, 153 del, 1127 sub ] exp/tri3b/decode_tgpr_dev93.si/wer_15 @@ -277,4 +261,53 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_*; do grep WER $x/wer_* | util %WER 6.68 [ 377 / 5643, 102 ins, 13 del, 262 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_utt/wer_10 %WER 6.56 [ 370 / 5643, 100 ins, 12 del, 258 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_utt_offline/wer_10 - +# RNNLM n-best rescoring with Mikolov's model: +for x in exp/nnet2_online/nnet_ms_a_online/decode_*rnnlm.h300.voc40k; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 5.60 [ 461 / 8234, 51 ins, 70 del, 340 sub ] 
exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_dev93_rnnlm.h300.voc40k/wer_15_0.0 +%WER 2.64 [ 149 / 5643, 21 ins, 13 del, 115 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_eval92_rnnlm.h300.voc40k/wer_11_0.5 +%WER 8.16 [ 672 / 8234, 136 ins, 70 del, 466 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_dev93_rnnlm.h300.voc40k/wer_14_0.5 +%WER 5.39 [ 304 / 5643, 74 ins, 16 del, 214 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_rnnlm.h300.voc40k/wer_17_0.5 + +# RNNLM lattice rescoring with Mikolov's model: +for x in exp/nnet2_online/nnet_ms_a_online/decode_*rnnlm.h300.voc40k_lat; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 5.05 [ 416 / 8234, 47 ins, 72 del, 297 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_dev93_rnnlm.h300.voc40k_lat/wer_16_0.0 +%WER 2.59 [ 146 / 5643, 19 ins, 14 del, 113 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_eval92_rnnlm.h300.voc40k_lat/wer_10_0.5 +%WER 7.70 [ 634 / 8234, 133 ins, 67 del, 434 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_dev93_rnnlm.h300.voc40k_lat/wer_13_0.5 +%WER 5.25 [ 296 / 5643, 81 ins, 14 del, 201 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_rnnlm.h300.voc40k_lat/wer_14_0.5 + +# RNNLM n-best rescoring with Yandex's model: +for x in exp/nnet2_online/nnet_ms_a_online/decode_*rnnlm-hs.nce20.h400.voc40k; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 5.31 [ 437 / 8234, 50 ins, 66 del, 321 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_dev93_rnnlm-hs.nce20.h400.voc40k/wer_13_0.0 +%WER 2.91 [ 164 / 5643, 24 ins, 9 del, 131 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_eval92_rnnlm-hs.nce20.h400.voc40k/wer_10_0.0 +%WER 7.83 [ 645 / 8234, 159 ins, 50 del, 436 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_dev93_rnnlm-hs.nce20.h400.voc40k/wer_11_0.0 +%WER 5.40 [ 305 / 5643, 77 ins, 16 del, 212 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_rnnlm-hs.nce20.h400.voc40k/wer_13_1.0 + +# TDNN results: +for x in exp/nnet3/nnet_tdnn_a/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 7.19 [ 592 / 8234, 51 ins, 109 del, 432 sub ] exp/nnet3/nnet_tdnn_a/decode_bd_tgpr_dev93/wer_13_0.5 +%WER 3.93 [ 222 / 5643, 23 ins, 20 del, 179 sub ] exp/nnet3/nnet_tdnn_a/decode_bd_tgpr_eval92/wer_10_1.0 +%WER 9.78 [ 805 / 8234, 167 ins, 72 del, 566 sub ] exp/nnet3/nnet_tdnn_a/decode_tgpr_dev93/wer_10_0.0 +%WER 6.40 [ 361 / 5643, 87 ins, 16 del, 258 sub ] exp/nnet3/nnet_tdnn_a/decode_tgpr_eval92/wer_10_1.0 + +# local/nnet3/run_lstm.sh +# LSTM results: cell_dim=1024, recurrent_projection_dim=non_recurrent_projection_dim=256,lstm_delay=-1 -2 -3, label_delay=5, num_params=11894059 +%WER 7.32 exp/nnet3/lstm_ld5/decode_bd_tgpr_dev93/wer_11_0.0 +%WER 4.24 exp/nnet3/lstm_ld5/decode_bd_tgpr_eval92/wer_10_1.0 +%WER 9.57 exp/nnet3/lstm_ld5/decode_tgpr_dev93/wer_9_1.0 +%WER 6.86 exp/nnet3/lstm_ld5/decode_tgpr_eval92/wer_10_1.0 + +# bidirectional LSTM +# ----------------------- +# local/nnet3/run_lstm.sh --affix bidirectional \ +# --lstm-delay " [-1,1] [-2,2] [-3,3] " \ +# --label-delay 0 \ +# --cell-dim 640 \ +# --recurrent-projection-dim 128 \ +# --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 \ +# --chunk-right-context 40 +# num_params=11485739 +%WER 6.81 exp/nnet3/lstm_bidirectional/decode_bd_tgpr_dev93/wer_11_0.0 +%WER 4.27 exp/nnet3/lstm_bidirectional/decode_bd_tgpr_eval92/wer_11_0.0 +%WER 9.29 exp/nnet3/lstm_bidirectional/decode_tgpr_dev93/wer_11_0.5 +%WER 6.61 exp/nnet3/lstm_bidirectional/decode_tgpr_eval92/wer_11_1.0 diff --git 
a/egs/wsj/s5/cmd.sh b/egs/wsj/s5/cmd.sh index 00aa0c145a3..537c46ba4f2 100644 --- a/egs/wsj/s5/cmd.sh +++ b/egs/wsj/s5/cmd.sh @@ -1,30 +1,29 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - -#b) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 2G" +export mkgraph_cmd="queue.pl --mem 4G" +# the use of cuda_cmd is deprecated. +export cuda_cmd="queue.pl --gpu 1" +# the rest of this file is present for historical reasons. +# It's better to use conf/queue.conf for cluster-specific configuration. #c) BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi - diff --git a/egs/wsj/s5/local/cstr_wsj_extend_dict.sh b/egs/wsj/s5/local/cstr_wsj_extend_dict.sh index b2a9faad704..8004db1d924 100755 --- a/egs/wsj/s5/local/cstr_wsj_extend_dict.sh +++ b/egs/wsj/s5/local/cstr_wsj_extend_dict.sh @@ -12,6 +12,11 @@ # way. # It makes use of scripts in local/dict/ +dict_suffix= + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + if [ $# -ne 1 ]; then echo "Usage: local/cstr_wsj_train_lms.sh WSJ1_doc_dir" exit 1 @@ -25,19 +30,20 @@ if [ ! -d $srcdir/lng_modl ]; then exit 1 fi -mkdir -p data/local/dict_larger -dir=data/local/dict_larger -cp data/local/dict/* data/local/dict_larger # Various files describing phones etc. +mkdir -p data/local/dict${dict_suffix}_larger +dir=data/local/dict${dict_suffix}_larger +cp data/local/dict${dict_suffix}/* data/local/dict${dict_suffix}_larger # Various files describing phones etc. 
# are there; we just want to copy them as the phoneset is the same. -rm data/local/dict_larger/lexicon.txt # we don't want this. +rm data/local/dict${dict_suffix}_larger/lexicon.txt # we don't want this. +rm data/local/dict${dict_suffix}_larger/lexiconp.txt # we don't want this either. mincount=2 # Minimum count of an OOV we will try to generate a pron for. -[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1; +[ ! -f data/local/dict${dict_suffix}/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1; # Remove comments from cmudict; print first field; remove # words like FOO(1) which are alternate prons: our dict format won't # include these markers. -grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a | +grep -v ';;;' data/local/dict${dict_suffix}/cmudict/cmudict.0.7a | perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu diff --git a/egs/wsj/s5/local/nnet2/run_5e_gpu.sh b/egs/wsj/s5/local/nnet2/run_5e_gpu.sh index d8ee94dd291..bae7327788e 100755 --- a/egs/wsj/s5/local/nnet2/run_5e_gpu.sh +++ b/egs/wsj/s5/local/nnet2/run_5e_gpu.sh @@ -15,14 +15,14 @@ dir=exp/nnet5e_gpu # learning rate, relative to run_5c.sh . ././cmd.sh . ./path.sh -! cuda-compiled && cat </dev/null +local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ + --cmd "$decode_cmd --mem 16G" \ + --hidden 300 --nwords 40000 --class 400 \ + --direct 2000 data/local/rnnlm.h300.voc40k \ + || touch data/local/rnnlm.h300.voc40k/.error & + +# Trains Yandex's version, which takes roughly 10 hours with the following +# parameter setting. We start from the dictionary directory without silence +# probabilities (with suffix "_nosp"). +num_threads_rnnlm=8 +rm data/local/rnnlm-hs.nce20.h400.voc40k/.error 2>/dev/null +local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ + --rnnlm_ver faster-rnnlm --threads $num_threads_rnnlm \ + --cmd "$decode_cmd --mem 8G --num-threads $num_threads_rnnlm" \ + --bptt 4 --bptt-block 10 --hidden 400 --nwords 40000 --direct 2000 \ + --rnnlm-options "-direct-order 4 -nce 20" \ + data/local/rnnlm-hs.nce20.h400.voc40k \ + || touch data/local/rnnlm-hs.nce20.h400.voc40k/.error & + +wait; + +# Rescoring. We demonstrate results on the TDNN models. Make sure you have +# finished running the following scripts: +# local/online/run_nnet2.sh +# local/online/run_nnet2_baseline.sh +# local/online/run_nnet2_discriminative.sh +for lm_suffix in tgpr bd_tgpr; do + graph_dir=exp/tri4b/graph_${lm_suffix} + for year in eval92 dev93; do + decode_dir=exp/nnet2_online/nnet_ms_a_online/decode_${lm_suffix}_${year} + + # N-best rescoring with Tomas Mikolov's version. + steps/rnnlmrescore.sh \ + --N 1000 --cmd "$decode_cmd --mem 16G" --inv-acwt 10 0.75 \ + data/lang_test_${lm_suffix} data/local/rnnlm.h300.voc40k \ + data/test_${year} ${decode_dir} \ + ${decode_dir}_rnnlm.h300.voc40k || exit 1; + + # Lattice rescoring with Tomas Mikolov's version. + steps/lmrescore_rnnlm_lat.sh \ + --weight 0.75 --cmd "$decode_cmd --mem 16G" --max-ngram-order 5 \ + data/lang_test_${lm_suffix} data/local/rnnlm.h300.voc40k \ + data/test_${year} ${decode_dir} \ + ${decode_dir}_rnnlm.h300.voc40k_lat || exit 1; + + # N-best rescoring with Yandex's version. 
+ steps/rnnlmrescore.sh --rnnlm_ver faster-rnnlm \ + --N 1000 --cmd "$decode_cmd --mem 8G" --inv-acwt 10 0.75 \ + data/lang_test_${lm_suffix} data/local/rnnlm-hs.nce20.h400.voc40k \ + data/test_${year} ${decode_dir} \ + ${decode_dir}_rnnlm-hs.nce20.h400.voc40k || exit 1; + done +done diff --git a/egs/wsj/s5/local/run_rnnlms_sgmm5b.sh b/egs/wsj/s5/local/run_rnnlms_sgmm5b.sh deleted file mode 100755 index 67fcee50a93..00000000000 --- a/egs/wsj/s5/local/run_rnnlms_sgmm5b.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -for test in dev93 eval92; do - - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \ - data/test_${test} exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 || exit 1; - - -# Note: for N-best-list generation, choosing the acoustic scale (12) that gave -# the best WER on this test set. Ideally we should do this on a dev set. - - # This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM. - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm30_0.25 \ - || exit 1; - - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm100_0.5 \ - || exit 1; - - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm200_0.5 \ - || exit 1; - - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.5 \ - || exit 1; - - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.75 \ - || exit 1; -done diff --git a/egs/wsj/s5/local/run_rnnlms_tri3b.sh b/egs/wsj/s5/local/run_rnnlms_tri3b.sh deleted file mode 100755 index 5d056860848..00000000000 --- a/egs/wsj/s5/local/run_rnnlms_tri3b.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -lang_suffix= - -echo "$0 $@" # Print the command line for logging -. utils/parse_options.sh || exit 1; - -. cmd.sh - - # This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM. 
-steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.25 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h30.voc10k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm30_0.25 || exit 1; - -steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h100.voc20k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm100_0.5 || exit 1; - -steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h200.voc30k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm200_0.5 || exit 1; - -steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5 || exit 1; - -steps/rnnlmrescore.sh \ - --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 - -dir=exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.75_N1000 -rm -rf $dir -cp -r exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 $dir -steps/rnnlmrescore.sh \ - --stage 7 --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.75 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg $dir - -dir=exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.75 -rm -rf $dir -cp -r exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir -steps/rnnlmrescore.sh \ - --stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.75 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg $dir - -dir=exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.25 -rm -rf $dir -cp -r exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir -steps/rnnlmrescore.sh \ - --stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.25 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg $dir - -steps/rnnlmrescore.sh \ - --N 10 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5_N10 \ - || exit 1; - diff --git a/egs/wsj/s5/local/wsj_data_prep.sh b/egs/wsj/s5/local/wsj_data_prep.sh index 3463747138a..edb9e6f2e3a 100755 --- a/egs/wsj/s5/local/wsj_data_prep.sh +++ b/egs/wsj/s5/local/wsj_data_prep.sh @@ -48,6 +48,8 @@ if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then echo "wsj_data_prep.sh: Spot check of command line arguments failed" echo "Command line arguments must be absolute pathnames to WSJ directories" echo "with names like 11-13.1." + echo "Note: if you have old-style WSJ distribution," + echo "local/cstr_wsj_data_prep.sh may work instead, see run.sh for example." 
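For readers with such an old-style layout (flat wsj0/ and wsj1/ directories rather than 11-13.1-style subdirectories), here is a minimal sketch of the alternative preparation, following the pointer in the new note ("see run.sh for example"); the corpus path is a placeholder:

  corpus=/path/to/wsj   # placeholder: a root with wsj0/ and wsj1/ directly underneath
  local/cstr_wsj_data_prep.sh $corpus
  local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/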
exit 1; fi @@ -70,14 +72,14 @@ nl=`cat train_si284.flist | wc -l` [ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl" # Now for the test sets. -# links/13-34.1/wsj1/doc/indices/readme.doc +# links/13-34.1/wsj1/doc/indices/readme.doc # describes all the different test sets. # Note: each test-set seems to come in multiple versions depending # on different vocabulary sizes, verbalized vs. non-verbalized # pronunciations, etc. We use the largest vocab and non-verbalized # pronunciations. # The most normal one seems to be the "baseline 60k test set", which -# is h1_p0. +# is h1_p0. # Nov'92 (333 utts) # These index files have a slightly different format; @@ -113,8 +115,8 @@ cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \ # Dev-set Hub 1,2 (503, 913 utterances) -# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. -# Sometimes this gets copied from the CD's with upcasing, don't know +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know # why (could be older versions of the disks). find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist @@ -136,7 +138,7 @@ noiseword=""; for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1; done - + # Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp @@ -186,19 +188,19 @@ if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` - rm wsj0-train-spkrinfo.txt ! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \ echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ - wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt + wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt fi if [ ! -f wsj0-train-spkrinfo.txt ]; then echo "Could not get the spkrinfo.txt file from LDC website (moved)?" - echo "This is possibly omitted from the training disks; couldn't find it." + echo "This is possibly omitted from the training disks; couldn't find it." echo "Everything else may have worked; we just may be missing gender info" echo "which is only needed for VTLN-related diagnostics anyway." exit 1 fi # Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the # LDC put it on the web. Perhaps it was accidentally omitted from the -# disks. +# disks. cat links/11-13.1/wsj0/doc/spkrinfo.txt \ links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \ diff --git a/egs/wsj/s5/local/wsj_extend_dict.sh b/egs/wsj/s5/local/wsj_extend_dict.sh index 160d866843a..c2b11b8dc8b 100755 --- a/egs/wsj/s5/local/wsj_extend_dict.sh +++ b/egs/wsj/s5/local/wsj_extend_dict.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script builds a larger word-list and dictionary +# This script builds a larger word-list and dictionary # than used for the LMs supplied with the WSJ corpus. # It uses a couple of strategies to fill-in words in # the LM training data but not in CMUdict. 
One is @@ -23,6 +23,8 @@ if [ $# -ne 1 ]; then fi if [ "`basename $1`" != 13-32.1 ]; then echo "Expecting the argument to this script to end in 13-32.1" + echo "Note: if you have old-style WSJ distribution," + echo "local/cstr_wsj_extend_dict.sh may work instead, see run.sh for example." exit 1 fi @@ -46,7 +48,7 @@ mincount=2 # Minimum count of an OOV we will try to generate a pron for. # Remove comments from cmudict; print first field; remove # words like FOO(1) which are alternate prons: our dict format won't # include these markers. -grep -v ';;;' data/local/dict${dict_suffix}/cmudict/cmudict.0.7a | +grep -v ';;;' data/local/dict${dict_suffix}/cmudict/cmudict.0.7a | perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu @@ -67,8 +69,8 @@ else | awk '/^){ chop; $isword{$_} = 1; } - while() { - @A = split(" ", $_); + while() { + @A = split(" ", $_); for ($n = 0; $n < @A; $n++) { $a = $A[$n]; if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "." @@ -81,7 +83,7 @@ else } ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz fi - + # get unigram counts echo "Getting unigram counts" gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \ @@ -105,7 +107,7 @@ cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }} # First make some prons for possible acronyms. # Note: we don't do this for things like U.K or U.N, -# or A.B. (which doesn't exist anyway), +# or A.B. (which doesn't exist anyway), # as we consider this normalization/spelling errors. cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms @@ -118,7 +120,7 @@ mkdir $dir/f $dir/b # forward, backward directions of rules... # Remove ; and , from words, if they are present; these # might crash our scripts, as they are used as separators there. -filter_dict.pl $dir/dict.cmu > $dir/f/dict +filter_dict.pl $dir/dict.cmu > $dir/f/dict cat $dir/oovlist | filter_dict.pl > $dir/f/oovs reverse_dict.pl $dir/f/dict > $dir/b/dict reverse_dict.pl $dir/f/oovs > $dir/b/oovs @@ -140,8 +142,8 @@ for d in $dir/f $dir/b; do score_rules.pl rules.with_scores get_candidate_prons.pl rules.with_scores dict oovs | \ limit_candidate_prons.pl hierarchy > oovs.candidates - ) & -done + ) & +done wait # Merge the candidates. 
@@ -159,9 +161,9 @@ sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts -echo "**Top OOVs we handled are:**"; +echo "**Top OOVs we handled are:**"; head $dir/oovlist.handled.counts -echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; +echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; head $dir/oovlist.not_handled.counts diff --git a/egs/wsj/s5/local/wsj_format_data.sh b/egs/wsj/s5/local/wsj_format_data.sh index c476e83ee6f..897b904db83 100755 --- a/egs/wsj/s5/local/wsj_format_data.sh +++ b/egs/wsj/s5/local/wsj_format_data.sh @@ -27,7 +27,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/lang${lang_suffix}_tmp/lexiconp.txt mkdir -p $tmpdir -for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do +for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.txt data/$x/text || exit 1; @@ -49,22 +49,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do cp -r data/lang${lang_suffix}/* $test || exit 1; gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl --skip-determinization-check $test || exit 1; done diff --git a/egs/wsj/s5/local/wsj_format_local_lms.sh b/egs/wsj/s5/local/wsj_format_local_lms.sh index 22493fbe963..c415a806fff 100755 --- a/egs/wsj/s5/local/wsj_format_local_lms.sh +++ b/egs/wsj/s5/local/wsj_format_local_lms.sh @@ -45,17 +45,13 @@ fi # Be careful: this time we dispense with the grep -v ' ' so this might # not work for LMs generated from all toolkits. 
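Both wsj_format_data.sh (above) and wsj_format_local_lms.sh (below) now build G.fst with a single arpa2fst invocation instead of the old arpa2fst | fstprint | eps2disambig.pl | s2eps.pl | fstcompile | fstrmepsilon | fstarcsort pipeline. A minimal sketch of the new-style conversion on a generic gzipped ARPA LM (the LM and lang-directory paths are placeholders):

  gunzip -c lm.arpa.gz | \
    arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_test/words.txt \
      - data/lang_test/G.fst
  fstisstochastic data/lang_test/G.fst   # sanity check, as in the scripts below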
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_tgpr/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - data/lang${lang_suffix}_test_bd_tgpr/G.fst || exit 1; fstisstochastic data/lang${lang_suffix}_test_bd_tgpr/G.fst gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_tg/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - data/lang${lang_suffix}_test_bd_tg/G.fst || exit 1; fstisstochastic data/lang${lang_suffix}_test_bd_tg/G.fst # Build ConstArpaLm for the unpruned language model. @@ -65,10 +61,8 @@ gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \ --unk-symbol=$unk - data/lang${lang_suffix}_test_bd_tgconst/G.carpa || exit 1 gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_fg/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - data/lang${lang_suffix}_test_bd_fg/G.fst || exit 1; fstisstochastic data/lang${lang_suffix}_test_bd_fg/G.fst # Build ConstArpaLm for the unpruned language model. @@ -78,10 +72,8 @@ gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \ --unk-symbol=$unk - data/lang${lang_suffix}_test_bd_fgconst/G.carpa || exit 1 gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_fgpr/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - data/lang${lang_suffix}_test_bd_fgpr/G.fst || exit 1; fstisstochastic data/lang${lang_suffix}_test_bd_fgpr/G.fst exit 0; diff --git a/egs/wsj/s5/local/wsj_train_rnnlms.sh b/egs/wsj/s5/local/wsj_train_rnnlms.sh index 1d4fda63fe7..d1ba64c0a3c 100755 --- a/egs/wsj/s5/local/wsj_train_rnnlms.sh +++ b/egs/wsj/s5/local/wsj_train_rnnlms.sh @@ -38,36 +38,10 @@ dir=$1 srcdir=data/local/dict${dict_suffix}_larger mkdir -p $dir +$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH -( # First make sure the kaldi_lm toolkit is installed. - # Note: this didn't work out of the box for me, I had to - # change the g++ version to just "g++" (no cross-compilation - # needed for me as I ran on a machine that had been setup - # as 64 bit by default. - cd $KALDI_ROOT/tools || exit 1; - if [ -f $rnnlm_ver/rnnlm ]; then - echo Not installing the rnnlm toolkit since it is already there. - else - if [ $rnnlm_ver == "rnnlm-hs-0.1b" ]; then - extras/install_rnnlm_hs.sh - else - echo Downloading and installing the rnnlm tools - # http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz - if [ ! 
-f $rnnlm_ver.tgz ]; then - wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1; - fi - mkdir $rnnlm_ver - cd $rnnlm_ver - tar -xvzf ../$rnnlm_ver.tgz || exit 1; - make CC=g++ || exit 1; - echo Done making the rnnlm tools - fi - fi -) || exit 1; - - if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist"; echo "You need to run local/wsj_extend_dict.sh before running this script." diff --git a/egs/wsj/s5/path.sh b/egs/wsj/s5/path.sh index c85d79a7ce3..2d17b17a84a 100755 --- a/egs/wsj/s5/path.sh +++ b/egs/wsj/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/nnet3bin/:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index 8630352cdd9..7660320dece 100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -20,8 +20,8 @@ wsj1=/export/corpora5/LDC/LDC94S13B local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1; -# Sometimes, we have seen WSJ distributions that do not have subdirectories -# like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the +# Sometimes, we have seen WSJ distributions that do not have subdirectories +# like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the # wsj0 or wsj1 directories. In such cases, try the following: # # corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj @@ -44,65 +44,23 @@ local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1; # containing many of the OOVs in the WSJ LM training data, # and an LM trained directly on that data (i.e. not just # copying the arpa files from the disks from LDC). - # Caution: the commands below will only work if $decode_cmd + # Caution: the commands below will only work if $decode_cmd # is setup to use qsub. Else, just remove the --cmd option. - # NOTE: If you have a setup corresponding to the cstr_wsj_data_prep.sh style, - # use local/cstr_wsj_extend_dict.sh $corpus/wsj1/doc/ instead. - - # Note: I am commenting out the RNNLM-building commands below. They take up a lot - # of CPU time and are not really part of the "main recipe." - # Be careful: appending things like "--mem 10G" to $decode_cmd - # won't always work, it depends what $decode_cmd is. + # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style, + # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead. 
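Returning briefly to the path.sh change earlier in this patch: the long explicit PATH export is replaced by keeping only utils/ and the OpenFst bin directory explicit and sourcing $KALDI_ROOT/tools/config/common_path.sh for the rest (presumably the various src/*bin directories). A quick sanity check, assuming the binaries have been built:

  . ./path.sh
  which fstcompile gmm-latgen-faster nnet3-latgen-faster   # should all resolve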
( local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \ utils/prepare_lang.sh data/local/dict_nosp_larger \ "" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \ local/wsj_train_lms.sh --dict-suffix "_nosp" && local/wsj_format_local_lms.sh --lang-suffix "_nosp" # && - # - # ( local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - # --cmd "$decode_cmd --mem 10G" data/local/rnnlm.h30.voc10k & - # sleep 20; # wait till tools compiled. - # local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - # --cmd "$decode_cmd --mem 12G" \ - # --hidden 100 --nwords 20000 --class 350 \ - # --direct 1500 data/local/rnnlm.h100.voc20k & - # local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - # --cmd "$decode_cmd --mem 14G" \ - # --hidden 200 --nwords 30000 --class 350 \ - # --direct 1500 data/local/rnnlm.h200.voc30k & - # local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - # --cmd "$decode_cmd --mem 16G" \ - # --hidden 300 --nwords 40000 --class 400 \ - # --direct 2000 data/local/rnnlm.h300.voc40k & - # ) - false && \ # Comment this out to train RNNLM-HS - ( - num_threads_rnnlm=8 - local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \ - --cmd "$decode_cmd --mem 1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \ - --hidden 30 --nwords 10000 --direct 1000 data/local/rnnlm-hs.h30.voc10k - local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \ - --cmd "$decode_cmd --mem 1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \ - --hidden 100 --nwords 20000 --direct 1500 data/local/rnnlm-hs.h100.voc20k - local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \ - --cmd "$decode_cmd --mem 1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \ - --hidden 300 --nwords 30000 --direct 1500 data/local/rnnlm-hs.h300.voc30k - local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \ - --cmd "$decode_cmd --mem 1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \ - --hidden 400 --nwords 40000 --direct 2000 data/local/rnnlm-hs.h400.voc40k - ) ) & # Now make MFCC features. # mfccdir should be some place with a largish disk where you # want to store MFCC features. mfccdir=mfcc -for x in test_eval92 test_eval93 test_dev93 train_si284; do +for x in test_eval92 test_eval93 test_dev93 train_si284; do steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 \ data/$x exp/make_mfcc/$x $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; @@ -129,7 +87,7 @@ steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ - data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 + data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 ) & steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ @@ -143,7 +101,7 @@ while [ ! -f data/lang_nosp_test_tgpr/tmp/LG.fst ] || \ sleep 20; done sleep 30; -# or the mono mkgraph.sh might be writing +# or the mono mkgraph.sh might be writing # data/lang_test_tgpr/tmp/LG.fst which will cause this to fail. 
utils/mkgraph.sh data/lang_nosp_test_tgpr \ @@ -222,9 +180,9 @@ steps/lmrescore.sh --cmd "$decode_cmd" \ exp/tri2b/decode_nosp_tgpr_dev93_tg || exit 1; # Trying Minimum Bayes Risk decoding (like Confusion Network decoding): -mkdir exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr +mkdir exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr cp exp/tri2b/decode_nosp_tgpr_dev93_tg/lat.*.gz \ - exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr + exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr local/score_mbr.sh --cmd "$decode_cmd" \ data/test_dev93/ data/lang_nosp_test_tgpr/ \ exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr @@ -291,14 +249,6 @@ steps/lmrescore.sh --cmd "$decode_cmd" \ data/test_eval92 exp/tri3b/decode_nosp_bd_tgpr_eval92 \ exp/tri3b/decode_nosp_bd_tgpr_eval92_tg || exit 1; -# The command below is commented out as we commented out the steps above -# that build the RNNLMs, so it would fail. -# local/run_rnnlms_tri3b.sh --lang-suffix "_nosp" - -# The command below is commented out as we commented out the steps above -# that build the RNNLMs (HS version), so it would fail. -# wait; local/run_rnnlm-hs_tri3b.sh --lang-suffix "_nosp" - # The following two steps, which are a kind of side-branch, try mixing up ( # from the 3b system. This is to demonstrate that script. steps/mixup.sh --cmd "$train_cmd" \ @@ -326,7 +276,7 @@ steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ exp/tri4a/graph_nosp_tgpr data/test_eval92 \ exp/tri4a/decode_nosp_tgpr_eval92 || exit 1; -) & +) & # This step is just to demonstrate the train_quick.sh script, in which we @@ -417,6 +367,10 @@ local/online/run_nnet2.sh local/online/run_nnet2_baseline.sh local/online/run_nnet2_discriminative.sh +# Demonstration of RNNLM rescoring on TDNN models. We comment this out by +# default. +# local/run_rnnlms.sh + local/run_mmi_tri4b.sh #local/run_nnet2.sh @@ -443,7 +397,7 @@ local/nnet/run_dnn.sh #local/run_bnf_sgmm.sh -# You probably want to try KL-HMM +# You probably want to try KL-HMM #local/run_kl_hmm.sh # Getting results [see RESULTS file] @@ -474,9 +428,20 @@ local/nnet/run_dnn.sh # - exp/tri4b/decode_bd_tgpr_eval92/kws/kwslist.xml # # forward-backward decoding example [way to speed up decoding by decoding forward -# # and backward in time] +# # and backward in time] # local/run_fwdbwd.sh # # A couple of nnet3 recipes: # local/nnet3/run_tdnn_baseline.sh # designed for exact comparison with nnet2 recipe -# local/nnet3/run_tdnn.sh # better absolute results \ No newline at end of file +# local/nnet3/run_tdnn.sh # better absolute results +# local/nnet3/run_lstm.sh # lstm recipe +# bidirectional lstm recipe +# local/nnet3/run_lstm.sh --affix bidirectional \ +# --lstm-delay " [-1,1] [-2,2] [-3,3] " \ +# --label-delay 0 \ +# --cell-dim 640 \ +# --recurrent-projection-dim 128 \ +# --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 \ +# --chunk-right-context 40 + diff --git a/egs/wsj/s5/steps/align_basis_fmllr.sh b/egs/wsj/s5/steps/align_basis_fmllr.sh index 0f195c6e88f..b3a2107a086 100755 --- a/egs/wsj/s5/steps/align_basis_fmllr.sh +++ b/egs/wsj/s5/steps/align_basis_fmllr.sh @@ -63,12 +63,14 @@ splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. 
+delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +cp $srcdir/delta_opts $dir 2>/dev/null if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" cp $srcdir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh new file mode 100755 index 00000000000..12c2a6479e4 --- /dev/null +++ b/egs/wsj/s5/steps/align_fmllr_lats.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Version of align_fmllr.sh that generates lattices (lat.*.gz) with +# alignments of alternative pronunciations in them. Mainly intended +# as a precursor to CTC training for now. + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=10 +retry_beam=40 +final_beam=20 # For the lattice-generation phase there is no retry-beam. This + # is a limitation of gmm-latgen-faster. We just use an + # intermediate beam. We'll lose a little data and it will be + # slightly slower. (however, the min-active of 200 that + # gmm-latgen-faster defaults to may help.) +boost_silence=1.0 # factor by which to boost silence during alignment. +fmllr_update_type=full +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_fmllr_lats.sh " + echo "e.g.: steps/align_fmllr_lats.sh data/train data/lang exp/tri1 exp/tri1_lats" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --fmllr-update-type (full|diag|offset|none) # default full." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +sdata=$data/split$nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.alimdl $dir 2>/dev/null +cp $srcdir/final.occs $dir; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. 
+delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +cp $srcdir/delta_opts $dir 2>/dev/null + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + cp $srcdir/full.mat $dir 2>/dev/null + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |" +mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" + + +## because gmm-latgen-faster doesn't support adding the transition-probs to the +## graph itself, we need to bake them into the compiled graphs. This means we can't reuse previously compiled graphs, +## because the other scripts write them without transition probs. +if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + + +if [ $stage -le 1 ]; then + # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because, + # as explained above, we compiled the transition probs into the training + # graphs. + echo "$0: aligning data in $data using $alimdl and speaker-independent features." + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \ + --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + fi +fi + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +if [ $stage -le 3 ]; then + # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more + # alignment errors (however, it does have a default min-active=200 so this + # will tend to reduce alignment errors). 
+ # --allow_partial=false makes sure we reach the end of the decoding graph. + # --word-determinize=false makes sure we retain the alternative pronunciations of + # words (including alternatives regarding optional silences). + # --lattice-beam=$beam keeps all the alternatives that were within the beam, + # it means we do no pruning of the lattice (lattices from a training transcription + # will be small anyway). + echo "$0: generating lattices containing alternate pronunciations." + $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$final_beam \ + --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \ + "$mdl_cmd" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done generating lattices from training transcripts." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/wsj/s5/steps/align_si.sh b/egs/wsj/s5/steps/align_si.sh index 2e3e9496d5d..ff53c773819 100755 --- a/egs/wsj/s5/steps/align_si.sh +++ b/egs/wsj/s5/steps/align_si.sh @@ -56,6 +56,8 @@ splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +cp $srcdir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -68,7 +70,7 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" cp $srcdir/final.mat $srcdir/full.mat $dir ;; diff --git a/egs/wsj/s5/steps/cleanup/combine_short_segments.py b/egs/wsj/s5/steps/cleanup/combine_short_segments.py new file mode 100755 index 00000000000..f51da6afa25 --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/combine_short_segments.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti +# Apache 2.0 + +import argparse +import sys +import os +import subprocess +import errno +import copy +import shutil + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + This script concatenates segments in the input_data_dir to ensure that""" + " the segments in the output_data_dir have a specified minimum length.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--minimum-duration", type=float, required = True, + help="Minimum duration of the segments in the output directory") + parser.add_argument("--input-data-dir", type=str, required = True) + parser.add_argument("--output-data-dir", type=str, required = True) + + print(' '.join(sys.argv)) + args = parser.parse_args() + return args + +def RunKaldiCommand(command, wait = True): + """ Runs commands frequently seen in Kaldi scripts. 
These are usually a + sequence of commands connected by pipes, so we use shell=True """ + p = subprocess.Popen(command, shell = True, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE) + + if wait: + [stdout, stderr] = p.communicate() + if p.returncode is not 0: + raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr) + return stdout, stderr + else: + return p + +def MakeDir(dir): + try: + os.mkdir(dir) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise exc + raise Exception("Directory {0} already exists".format(dir)) + pass + +def CheckFiles(input_data_dir): + for file_name in ['spk2utt', 'text', 'utt2spk', 'feats.scp']: + file_name = '{0}/{1}'.format(input_data_dir, file_name) + if not os.path.exists(file_name): + raise Exception("There is no such file {0}".format(file_name)) + +def ParseFileToDict(file, assert2fields = False, value_processor = None): + if value_processor is None: + value_processor = lambda x: x[0] + + dict = {} + for line in open(file, 'r'): + parts = line.split() + if assert2fields: + assert(len(parts) == 2) + + dict[parts[0]] = value_processor(parts[1:]) + return dict + +def WriteDictToFile(dict, file_name): + file = open(file_name, 'w') + keys = dict.keys() + keys.sort() + for key in keys: + value = dict[key] + if type(value) in [list, tuple] : + if type(value) is tuple: + value = list(value) + value.sort() + value = ' '.join(value) + file.write('{0}\t{1}\n'.format(key, value)) + file.close() + + +def ParseDataDirInfo(data_dir): + data_dir_file = lambda file_name: '{0}/{1}'.format(data_dir, file_name) + + utt2spk = ParseFileToDict(data_dir_file('utt2spk')) + spk2utt = ParseFileToDict(data_dir_file('spk2utt'), value_processor = lambda x: x) + text = ParseFileToDict(data_dir_file('text'), value_processor = lambda x: " ".join(x)) + # we want to assert feats.scp has just 2 fields, as we don't know how + # to process it otherwise + feat = ParseFileToDict(data_dir_file('feats.scp'), assert2fields = True) + utt2dur = ParseFileToDict(data_dir_file('utt2dur'), value_processor = lambda x: float(x[0])) + utt2uniq = None + if os.path.exists(data_dir_file('utt2uniq')): + utt2uniq = ParseFileToDict(data_dir_file('utt2uniq')) + return utt2spk, spk2utt, text, feat, utt2dur, utt2uniq + + +def GetCombinedUttIndexRange(utt_index, utts, utt_durs, minimum_duration): + # We want the minimum number of concatenations + # to reach the minimum_duration. If two concatenations satisfy + # the minimum duration constraint we choose the shorter one. 
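Taking the new steps/cleanup/combine_short_segments.py as a whole: it operates on a Kaldi data directory (it checks for spk2utt, text, utt2spk and feats.scp) and writes a new directory in which short segments have been concatenated until each reaches the requested minimum duration. A usage sketch with arbitrary example values (the 1.5-second threshold and directory names are illustrative only):

  steps/cleanup/combine_short_segments.py \
    --minimum-duration 1.5 \
    --input-data-dir data/train \
    --output-data-dir data/train_comb1.5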
+ left_index = utt_index - 1 + right_index = utt_index + 1 + num_remaining_segments = len(utts) - 1 + cur_utt_dur = utt_durs[utts[utt_index]] + + while num_remaining_segments > 0: + + left_utt_dur = 0 + if left_index >= 0: + left_utt_dur = utt_durs[utts[left_index]] + right_utt_dur = 0 + if right_index <= len(utts) - 1: + right_utt_dur = utt_durs[utts[right_index]] + + right_combined_utt_dur = cur_utt_dur + right_utt_dur + left_combined_utt_dur = cur_utt_dur + left_utt_dur + left_right_combined_utt_dur = cur_utt_dur + left_utt_dur + right_utt_dur + + combine_left_exit = False + combine_right_exit = False + if right_combined_utt_dur >= minimum_duration: + if left_combined_utt_dur >= minimum_duration: + if left_combined_utt_dur <= right_combined_utt_dur: + combine_left_exit = True + else: + combine_right_exit = True + else: + combine_right_exit = True + elif left_combined_utt_dur >= minimum_duration: + combine_left_exit = True + elif left_right_combined_utt_dur >= minimum_duration : + combine_left_exit = True + combine_right_exit = True + + if combine_left_exit and combine_right_exit: + cur_utt_dur = left_right_combined_utt_dur + break + elif combine_left_exit: + cur_utt_dur = left_combined_utt_dur + # move back the right_index as we don't need to combine it + right_index = right_index - 1 + break + elif combine_right_exit: + cur_utt_dur = right_combined_utt_dur + # move back the left_index as we don't need to combine it + left_index = left_index + 1 + break + + # couldn't satisfy minimum duration requirement so continue search + if left_index >= 0: + num_remaining_segments = num_remaining_segments - 1 + if right_index <= len(utts) - 1: + num_remaining_segments = num_remaining_segments - 1 + + left_index = left_index - 1 + right_index = right_index + 1 + + cur_utt_dur = left_right_combined_utt_dur + left_index = max(0, left_index) + right_index = min(len(utts)-1, right_index) + return left_index, right_index, cur_utt_dur + + +def WriteCombinedDirFiles(output_dir, utt2spk, spk2utt, text, feat, utt2dur, utt2uniq): + out_dir_file = lambda file_name: '{0}/{1}'.format(output_dir, file_name) + total_combined_utt_list = [] + for speaker in spk2utt.keys(): + utts = spk2utt[speaker] + for utt in utts: + if type(utt) is tuple: + #this is a combined utt + total_combined_utt_list.append((speaker, utt)) + + for speaker, combined_utt_tuple in total_combined_utt_list: + combined_utt_list = list(combined_utt_tuple) + combined_utt_list.sort() + new_utt_name = "-".join(combined_utt_list)+'-appended' + + # updating the utt2spk dict + for utt in combined_utt_list: + spk_name = utt2spk.pop(utt) + utt2spk[new_utt_name] = spk_name + + # updating the spk2utt dict + spk2utt[speaker].remove(combined_utt_tuple) + spk2utt[speaker].append(new_utt_name) + + # updating the text dict + combined_text = [] + for utt in combined_utt_list: + combined_text.append(text.pop(utt)) + text[new_utt_name] = ' '.join(combined_text) + + # updating the feat dict + combined_feat = [] + for utt in combined_utt_list: + combined_feat.append(feat.pop(utt)) + feat_command = "concat-feats --print-args=false {feats} - |".format(feats = " ".join(combined_feat)) + feat[new_utt_name] = feat_command + + # updating utt2dur + combined_dur = 0 + for utt in combined_utt_list: + combined_dur += utt2dur.pop(utt) + utt2dur[new_utt_name] = combined_dur + + # updating utt2uniq + if utt2uniq is not None: + combined_uniqs = [] + for utt in combined_utt_list: + combined_uniqs.append(utt2uniq.pop(utt)) + # utt2uniq file is used to map perturbed data to original 
unperturbed + # versions so that the training cross validation sets can avoid overlap + # of data however if perturbation changes the length of the utterance + # (e.g. speed perturbation) the utterance combinations in each + # perturbation of the original recording can be very different. So there + # is no good way to find the utt2uniq mappinng so that we can avoid + # overlap. + utt2uniq[new_utt_name] = combined_uniqs[0] + + + WriteDictToFile(utt2spk, out_dir_file('utt2spk')) + WriteDictToFile(spk2utt, out_dir_file('spk2utt')) + WriteDictToFile(feat, out_dir_file('feats.scp')) + WriteDictToFile(text, out_dir_file('text')) + if utt2uniq is not None: + WriteDictToFile(utt2uniq, out_dir_file('utt2uniq')) + WriteDictToFile(utt2dur, out_dir_file('utt2dur')) + + +def CombineSegments(input_dir, output_dir, minimum_duration): + utt2spk, spk2utt, text, feat, utt2dur, utt2uniq = ParseDataDirInfo(input_dir) + total_combined_utt_list = [] + + # copy the duration dictionary so that we can modify it + utt_durs = copy.deepcopy(utt2dur) + speakers = spk2utt.keys() + speakers.sort() + for speaker in speakers: + + utts = spk2utt[speaker] # this is an assignment of the reference + # In WriteCombinedDirFiles the values of spk2utt will have the list + # of combined utts which will be used as reference + + # we make an assumption that the sorted uttlist corresponds + # to contiguous segments. This is true only if utt naming + # is done according to accepted conventions + # this is an easily violatable assumption. Have to think of a better + # way to do this. + utts.sort() + utt_index = 0 + while utt_index < len(utts): + if utt_durs[utts[utt_index]] < minimum_duration: + left_index, right_index, cur_utt_dur = GetCombinedUttIndexRange(utt_index, utts, utt_durs, minimum_duration) + if not cur_utt_dur >= minimum_duration: + # this is a rare occurrence, better make the user aware of this + # situation and let them deal with it + raise Exception('Speaker {0} does not have enough utterances to satisfy the minimum duration constraint'.format(speaker)) + + combined_duration = 0 + combined_utts = [] + # update the utts_dur dictionary + for utt in utts[left_index:right_index + 1]: + combined_duration += utt_durs.pop(utt) + if type(utt) is tuple: + for item in utt: + combined_utts.append(item) + else: + combined_utts.append(utt) + combined_utts = tuple(combined_utts) # converting to immutable type to use as dictionary key + assert(cur_utt_dur == combined_duration) + + # now modify the utts list + combined_indices = range(left_index, right_index + 1) + # start popping from the largest index so that the lower + # indexes are valid + for i in combined_indices[::-1]: + utts.pop(i) + utts.insert(left_index, combined_utts) + utt_durs[combined_utts] = combined_duration + utt_index = left_index + utt_index = utt_index + 1 + WriteCombinedDirFiles(output_dir, utt2spk, spk2utt, text, feat, utt2dur, utt2uniq) + +def Main(): + args = GetArgs() + + CheckFiles(args.input_data_dir) + MakeDir(args.output_data_dir) + feat_lengths = {} + segments_file = '{0}/segments'.format(args.input_data_dir) + + RunKaldiCommand("utils/data/get_utt2dur.sh {0}".format(args.input_data_dir)) + + CombineSegments(args.input_data_dir, args.output_data_dir, args.minimum_duration) + + RunKaldiCommand("utils/utt2spk_to_spk2utt.pl {od}/utt2spk > {od}/spk2utt".format(od = args.output_data_dir)) + if os.path.exists('{0}/cmvn.scp'.format(args.input_data_dir)): + shutil.copy('{0}/cmvn.scp'.format(args.input_data_dir), args.output_data_dir) + + 
RunKaldiCommand("utils/fix_data_dir.sh {0}".format(args.output_data_dir)) +if __name__ == "__main__": + Main() + + diff --git a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl index 911640f5495..5af5fd34662 100755 --- a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl +++ b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl @@ -42,11 +42,11 @@ (default = "<***>") --wer-cutoff : Ignore segments with WER higher than the specified value. -1 means no segment will be ignored. (default = -1) - --use-silence-midpoints : Set to 1 if you want to use silence midpoints + --use-silence-midpoints : Set to 1 if you want to use silence midpoints instead of min_sil_length for silence overhang.(default 0) - --force-correct-boundary-words : Set to zero if the segments will not be + --force-correct-boundary-words : Set to zero if the segments will not be required to have boundary words to be correct. Default 1 - --aligned-ctm-filename : If set, the intermediate aligned ctm + --aligned-ctm-filename : If set, the intermediate aligned ctm is saved to this file EOU @@ -56,7 +56,7 @@ my $separator = ";"; my $special_symbol = "<***>"; my $wer_cutoff = -1; -my $use_silence_midpoints = 0; +my $use_silence_midpoints = 0; my $force_correct_boundary_words = 1; my $aligned_ctm_filename = ""; GetOptions( @@ -122,13 +122,13 @@ sub PrintSegment { # Works out the surrounding silence. my $index = $seg_start_index - 1; - while ($index >= 0 && $aligned_ctm->[$index]->[0] eq + while ($index >= 0 && $aligned_ctm->[$index]->[0] eq "" && $aligned_ctm->[$index]->[3] == 0) { $index -= 1; } - my $left_of_segment_has_deletion = "false"; - $left_of_segment_has_deletion = "true" - if ($index > 0 && $aligned_ctm->[$index-1]->[0] ne "" + my $left_of_segment_has_deletion = "false"; + $left_of_segment_has_deletion = "true" + if ($index > 0 && $aligned_ctm->[$index-1]->[0] ne "" && $aligned_ctm->[$index-1]->[3] == 0); my $pad_start_sil = ($aligned_ctm->[$seg_start_index]->[1] - @@ -141,11 +141,11 @@ sub PrintSegment { my $right_of_segment_has_deletion = "false"; $index = $seg_end_index + 1; while ($index < scalar(@{$aligned_ctm}) && - $aligned_ctm->[$index]->[0] eq "" && + $aligned_ctm->[$index]->[0] eq "" && $aligned_ctm->[$index]->[3] == 0) { $index += 1; } - $right_of_segment_has_deletion = "true" + $right_of_segment_has_deletion = "true" if ($index < scalar(@{$aligned_ctm})-1 && $aligned_ctm->[$index+1]->[0] ne "" && $aligned_ctm->[$index - 1]->[3] > 0); my $pad_end_sil = ($aligned_ctm->[$index - 1]->[1] + @@ -155,7 +155,7 @@ sub PrintSegment { if (($right_of_segment_has_deletion eq "true") || !$use_silence_midpoints) { if ($pad_end_sil > $min_sil_length / 2.0) { $pad_end_sil = $min_sil_length / 2.0; - } + } } my $seg_start = $aligned_ctm->[$seg_start_index]->[1] - $pad_start_sil; @@ -228,7 +228,8 @@ sub SplitLongSegment { $aligned_ctm->[$seg_end_index]->[2] - $aligned_ctm->[$seg_start_index]->[1]; my $current_seg_index = $seg_start_index; - while ($current_seg_length > 1.5 * $max_seg_length) { + my $aligned_ctm_size = keys($aligned_ctm); + while ($current_seg_length > 1.5 * $max_seg_length && $current_seg_index < $aligned_ctm_size-1) { my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index, $seg_end_index, $max_seg_length); my $ans = PrintSegment($aligned_ctm, $wav_id, $min_sil_length, @@ -241,6 +242,14 @@ sub SplitLongSegment { $aligned_ctm->[$current_seg_index]->[1]; } + if ($current_seg_index eq $aligned_ctm_size-1) { + my $ans = PrintSegment($aligned_ctm, 
$wav_id, $min_sil_length, + $min_seg_length, $current_seg_index, $current_seg_index, + $current_seg_count, $SO, $TO); + $current_seg_count += 1 if ($ans != -1); + return ($current_seg_count, $current_seg_index); + } + if ($current_seg_length > $max_seg_length) { my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index, $seg_end_index, @@ -269,6 +278,7 @@ sub ProcessWav { $current_ctm, $current_align, $SO, $TO, $ACT) = @_; my $wav_id = $current_ctm->[0]->[0]; + my $channel_id = $current_ctm->[0]->[1]; defined($wav_id) || die "Error: empty wav section\n"; # First, we have to align the ctm file to the Levenshtein alignment. @@ -308,7 +318,7 @@ sub ProcessWav { $aligned_ctm[-1]->[3] += 1; } else { push(@aligned_ctm, ["", $start, $dur, 1]); - } + } } else { # Case 2.3: substitution. push(@aligned_ctm, [$ref_word, $start, $dur, 1]); @@ -322,10 +332,10 @@ sub ProcessWav { } # Save the aligned CTM if needed - if(tell($ACT) != -1){ - for (my $i=0; $i<=$#aligned_ctm; $i++) { - print $ACT "$aligned_ctm[$i][0] $aligned_ctm[$i][1] "; - print $ACT "$aligned_ctm[$i][2] $aligned_ctm[$i][3]\n"; + if(defined($ACT)){ + for (my $i = 0; $i <= $#aligned_ctm; $i++) { + print $ACT "$wav_id $channel_id $aligned_ctm[$i][1] $aligned_ctm[$i][2] "; + print $ACT "$aligned_ctm[$i][0] $aligned_ctm[$i][3]\n"; } } @@ -345,8 +355,8 @@ sub ProcessWav { # length, and if there are no alignment error around it. We also make sure # that segment contains actual words, instead of pure silence. if ($aligned_ctm[$x]->[0] eq "" && - $aligned_ctm[$x]->[2] >= $min_sil_length - && (($force_correct_boundary_words && $lcorrect eq "true" && + $aligned_ctm[$x]->[2] >= $min_sil_length + && (($force_correct_boundary_words && $lcorrect eq "true" && $rcorrect eq "true") || !$force_correct_boundary_words)) { if ($current_seg_length <= $max_seg_length && $current_seg_length >= $min_seg_length) { @@ -378,7 +388,7 @@ sub ProcessWav { # 011 A 3.39 0.23 SELL # 011 A 3.62 0.18 OFF # 011 A 3.83 0.45 ASSETS -# +# # Output ctm: # 011 A 3.39 0.23 SELL # 011 A 3.62 0.18 OFF @@ -391,7 +401,7 @@ sub InsertSilence { my $new_start = sprintf("%.2f", $ctm_in->[$x - 1]->[2] + $ctm_in->[$x - 1]->[3]); - if ($new_start <= $ctm_in->[$x]->[2]) { + if ($new_start < $ctm_in->[$x]->[2]) { my $new_dur = sprintf("%.2f", $ctm_in->[$x]->[2] - $new_start); push(@{$ctm_out}, [$ctm_in->[$x - 1]->[0], $ctm_in->[$x - 1]->[1], $new_start, $new_dur, ""]); @@ -458,4 +468,4 @@ sub InsertSilence { close(AI); close($SO); close($TO); -close($ACT); +close($ACT) if defined($ACT); diff --git a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh index c768d89b44e..cdf1ff3e5df 100755 --- a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh +++ b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh @@ -4,11 +4,12 @@ # this script gets some stats that will help you debug the lexicon. -# Begin configuration section. +# Begin configuration section. stage=1 remove_stress=false nj=10 # number of jobs for various decoding-type things that we run. cmd=run.pl +alidir= # End configuration section echo "$0 $@" # Print the command line for logging @@ -26,6 +27,8 @@ if [ $# != 5 ]; then echo " --remove-stress # if true, remove stress before printing analysis" echo " # note: if you change this, you only have to rerun" echo " # from stage 10." + echo " --alidir # if supplied, training-data alignments and transforms" + echo " # are obtained from here instead of being generated." exit 1; fi @@ -41,38 +44,46 @@ for f in $data/feats.scp $lang/phones.txt $src/final.mdl $srcdict; do [ ! 
-f $f ] && echo "$0: expected file $f to exist" && exit 1; done -if [ $stage -le 1 ]; then - steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src ${src}_ali_$(basename $data) +if [ -z $alidir ]; then + alidir=${src}_ali_$(basename $data) + if [ $stage -le 1 ]; then + steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src $alidir + fi fi +phone_lang=data/$(basename $lang)_phone_bg + if [ $stage -le 2 ]; then - utils/make_phone_bigram_lang.sh $lang ${src}_ali_$(basename $data) data/$(basename $lang)_phone_bg + utils/make_phone_bigram_lang.sh $lang $alidir $phone_lang fi if [ $stage -le 3 ]; then - utils/mkgraph.sh data/$(basename $lang)_phone_bg $src $src/graph_phone_bg + utils/mkgraph.sh $phone_lang $src $src/graph_phone_bg fi if [ $stage -le 4 ]; then - steps/decode_si.sh --cmd "$cmd" --nj $nj --transform-dir ${src}_ali_$(basename $data) \ - --acwt 0.25 --beam 25.0 --lattice-beam 5.0 --max-active 2500 \ + steps/decode_si.sh --skip-scoring true \ + --cmd "$cmd" --nj $nj --transform-dir $alidir \ + --acwt 0.25 --beam 10.0 --lattice-beam 5.0 --max-active 2500 \ $src/graph_phone_bg $data $src/decode_$(basename $data)_phone_bg fi if [ $stage -le 5 ]; then - steps/get_train_ctm.sh $data $lang ${src}_ali_$(basename $data) + steps/get_train_ctm.sh --print-silence true --use-segments false \ + --cmd "$cmd" $data $lang $alidir fi if [ $stage -le 6 ]; then - steps/get_ctm.sh --min-lmwt 3 --max-lmwt 8 \ - $data data/$(basename $lang)_phone_bg $src/decode_$(basename $data)_phone_bg + steps/get_ctm.sh --use-segments false --cmd "$cmd" --min-lmwt 3 --max-lmwt 8 \ + $data $phone_lang $src/decode_$(basename $data)_phone_bg fi if [ $stage -le 7 ]; then mkdir -p $dir # lmwt=4 corresponds to the scale we decoded at. cp $src/decode_$(basename $data)_phone_bg/score_4/$(basename $data).ctm $dir/phone.ctm - cp ${src}_ali_$(basename $data)/ctm $dir/word.ctm + + cp $alidir/ctm $dir/word.ctm fi if [ $stage -le 8 ]; then @@ -82,7 +93,7 @@ if [ $stage -le 8 ]; then # we'll convert it into two entries like this, with the start and end separately: # sw02054-A 0021332 START and # sw02054-A 0021356 END and -# +# # and suppose phone.ctm has lines like # sw02054 A 213.09 0.24 sil # sw02054 A 213.33 0.13 ae_B @@ -95,18 +106,17 @@ if [ $stage -le 8 ]; then # then after sorting and merge-sorting the two ctm files we can easily # work out for each word, what the phones were during that time. 
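The commands below implement this merge. To make the merged, sorted stream concrete, here is a made-up fragment for one word (times invented for illustration): the PHONE entries falling between a word's START and END markers give its phones.

  sw02054-A 0021332 START and
  sw02054-A 0021339 PHONE ae
  sw02054-A 0021346 PHONE n
  sw02054-A 0021351 PHONE d
  sw02054-A 0021356 END and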
- grep -v '' data/$(basename $lang)_phone_bg/phones.txt | awk '{print $1, $1}' | \ + grep -v '' $phone_lang/phones.txt | awk '{print $1, $1}' | \ sed 's/_B$//' | sed 's/_I$//' | sed 's/_E$//' | sed 's/_S$//' >$dir/phone_map.txt - silphone=$(cat data/$(basename $lang)_phone_bg/phones/optional_silence.txt) - cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt | grep -v "$silphone\$" > $dir/phone_cleaned.ctm + cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt > $dir/phone_text.ctm > $dir/phone_mapped.ctm export LC_ALL=C - + cat $dir/word.ctm | awk '{printf("%s-%s %09d START %s\n", $1, $2, 100*$3, $5); printf("%s-%s %09d END %s\n", $1, $2, 100*($3+$4), $5);}' | \ sort >$dir/word_processed.ctm - cat $dir/phone_cleaned.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \ + cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \ sort >$dir/phone_processed.ctm # merge-sort both ctm's @@ -129,12 +139,16 @@ if [ $stage -le 10 ]; then else cp $srcdict $dir/lexicon.txt fi + silphone=$(cat $phone_lang/phones/optional_silence.txt) + echo " $silphone" >> $dir/lexicon.txt awk '{count[$2] += $1;} END {for (w in count){print w, count[w];}}' \ <$dir/prons.txt >$dir/counts.txt + + cat $dir/prons.txt | \ - if $remove_stress; then + if $remove_stress; then perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } ' else cat @@ -143,9 +157,9 @@ if [ $stage -le 10 ]; then open(D, "<$ARGV[0]") || die "opening dict file $ARGV[0]"; # create a hash of all reference pronuncations, and for each word, record # a list of the prons, separated by " | ". - while () { - @A = split(" ", $_); $is_pron{join(" ",@A)} = 1; - $w = shift @A; + while () { + @A = split(" ", $_); $is_pron{join(" ",@A)} = 1; + $w = shift @A; if (!defined $prons{$w}) { $prons{$w} = join(" ", @A); } else { $prons{$w} = $prons{$w} . " | " . join(" ", @A); } } diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh index 97fb62a9c4f..80a71b0edc5 100755 --- a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh @@ -5,9 +5,9 @@ # Computes training alignments using a model with delta or # LDA+MLLT features. This version, rather than just using the # text to align, computes mini-language models (unigram) from the text -# and a few common words in the LM, and allows +# and a few common words in the LM. -# Begin configuration section. +# Begin configuration section. nj=4 cmd=run.pl use_graphs=false @@ -82,7 +82,7 @@ echo "$0: feature type is $feat_type" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $srcdir/full.mat $dir + cp $srcdir/final.mat $srcdir/full.mat $dir ;; *) echo "$0: invalid feature type $feat_type" && exit 1; esac @@ -155,7 +155,7 @@ if [ $stage -le 2 ]; then # # with the fields separated by tabs, e.g. 
# adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED - + paste $dir/edits.txt \ <(awk '{print $2}' $dir/length.txt) \ <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ @@ -171,9 +171,9 @@ fi if [ $stage -le 3 ]; then ### - # These stats migh help people figure out what is wrong with the data + # These stats might help people figure out what is wrong with the data # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt - # b)evaluation of per-speaker performance to possibly find speakers with + # b)evaluation of per-speaker performance to possibly find speakers with # distinctive accents/speech disorders and similar # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure # out if there is systematic issue with lexicon, pronunciation or phonetic confusability diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh new file mode 100755 index 00000000000..19beaca8914 --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh @@ -0,0 +1,185 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2016 Api.ai (Author: Ilya Platonov) +# Apache 2.0 +# +# Tweaked version of find_bad_utts.sh to work with nnet2 and nnet3(supports chain models) non-ivector models. +# This script uses nnet-info and nnet3-am-info to determine type of nnet (nnet2 or nnet3). +# Use --acoustic-scale=1.0 for chain models. +# +# Begin configuration section. +nj=8 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=15.0 +lattice_beam=8.0 +max_active=750 +transform_dir= # directory to find fMLLR transforms in. +top_n_words=100 # Number of common words that we compile into each graph (most frequent + # in $lang/text. +stage=-1 +cleanup=true +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: $0

" + echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_debug" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \ + $lang/L_disambig.fst $lang/phones/disambig.int; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. + +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; + +#checking type of nnet +if nnet-info 1>/dev/null 2>/dev/null $srcdir/final.mdl; then + nnet_type="nnet"; + latgen_cmd="nnet-latgen-faster"; +elif nnet3-am-info 1>/dev/null 2>/dev/null $srcdir/final.mdl; then + nnet_type="nnet3" + frame_subsampling_factor=1; + nnet3_opt= + if [ -f $srcdir/frame_subsampling_factor ]; then + frame_subsampling_factor="$(cat $srcdir/frame_subsampling_factor)" + fi + if [ "$frame_subsamping_factor" != "1" ]; then + nnet3_opt="--frame-subsampling-factor=$frame_subsampling_factor"; + fi + latgen_cmd="nnet3-latgen-faster $nnet3_opt"; +else + echo "Unsupported type of nnet for $srcdir/final.mdl"; +fi + +echo "nnet type is $nnet_type"; + + +if [ $stage -le 0 ]; then + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \ + awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \ + sort -rn > $dir/word_counts.int || exit 1; + num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1; + # print top-n words with their unigram probabilities. 
+ + head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int + utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt +fi + +echo "$0: feature type is raw" + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"; + +if [ $stage -le 1 ]; then + echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir" + + rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null + + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \ + steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \ + compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \ + $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + $latgen_cmd --acoustic-scale=$acoustic_scale --beam=$beam \ + --max-active=$max_active --lattice-beam=$lattice_beam \ + --word-symbol-table=$lang/words.txt \ + $dir/final.mdl ark:- "$feats" ark:- \| \ + lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \ + ark,t:- ark,t:$dir/edits.JOB.txt \| \ + utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1; +fi + + +if [ $stage -le 2 ]; then + if [ -f $dir/edits.1.txt ]; then + # the awk commands below are to ensure that partially-written files don't confuse us. + for x in $(seq $nj); do cat $dir/edits.$x.txt; done | awk '{if(NF==2){print;}}' > $dir/edits.txt + for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/aligned_ref.txt + else + echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present." + fi + + # in case any utterances failed to align, get filtered copy of $data/text + utils/filter_scp.pl $dir/edits.txt < $data/text > $dir/text + cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt + + n1=$(wc -l < $dir/edits.txt) + n2=$(wc -l < $dir/aligned_ref.txt) + n3=$(wc -l < $dir/text) + n4=$(wc -l < $dir/length.txt) + if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then + echo "$0: mismatch in lengths of files:" + wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt + exit 1; + fi + + # note: the format of all_info.txt is: + # + # with the fields separated by tabs, e.g. 
+ # adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED + + paste $dir/edits.txt \ + <(awk '{print $2}' $dir/length.txt) \ + <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ + <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt + + sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt + + if $cleanup; then + rm $dir/edits.*.txt $dir/aligned_ref.*.txt + fi + +fi + +if [ $stage -le 3 ]; then + ### + # These stats migh help people figure out what is wrong with the data + # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt + # b)evaluation of per-speaker performance to possibly find speakers with + # distinctive accents/speech disorders and similar + # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure + # out if there is systematic issue with lexicon, pronunciation or phonetic confusability + + mkdir -p $dir/analysis + align-text --special-symbol="***" ark:$dir/text ark:$dir/aligned_ref.txt ark,t:- | \ + utils/scoring/wer_per_utt_details.pl --special-symbol "***" > $dir/analysis/per_utt_details.txt + + cat $dir/analysis/per_utt_details.txt | \ + utils/scoring/wer_per_spk_details.pl $data/utt2spk > $dir/analysis/per_spk_details.txt + + cat $dir/analysis/per_utt_details.txt | \ + utils/scoring/wer_ops_details.pl --special-symbol "***" | \ + sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt + +fi + diff --git a/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh b/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh index 733eba34d10..a3b1e2af70a 100755 --- a/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh +++ b/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh @@ -111,10 +111,8 @@ while read line; do if (invoc[$x]) { printf("%s ", $x); } else { printf("%s ", oov); } } printf("\n"); }' > $wdir/text ngram-count -text $wdir/text -order $ngram_order "$srilm_options" -lm - |\ - arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl |\ - fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \ - --keep_isymbols=false --keep_osymbols=false |\ - fstrmepsilon | fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - $wdir/G.fst || exit 1; fi fstisstochastic $wdir/G.fst || echo "$0: $uttid/G.fst not stochastic." @@ -134,7 +132,7 @@ while read line; do make-h-transducer --disambig-syms-out=$wdir/disambig_tid.int \ --transition-scale=$tscale $wdir/ilabels_${N}_${P} \ - $model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst + $model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst # Builds HCLGa.fst fsttablecompose $wdir/Ha.fst $wdir/CLG.fst | \ @@ -143,10 +141,10 @@ while read line; do fstminimizeencoded > $wdir/HCLGa.fst fstisstochastic $wdir/HCLGa.fst ||\ echo "$0: $uttid/HCLGa.fst is not stochastic" - + add-self-loops --self-loop-scale=$loopscale --reorder=true \ $model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst - + if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then fstisstochastic $wdir/HCLG.fst ||\ echo "$0: $uttid/HCLG.fst is not stochastic." 
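A quick way to act on the all_info.sorted.txt written by find_bad_utts_nnet.sh above is to turn the raw edit counts into per-utterance error rates; a small awk sketch (assuming the exp/tri1_debug output directory from the usage example) is:

  awk -F'\t' '{ split($1, a, " "); len = ($2 > 0 ? $2 : 1); printf("%.2f\t%d\t%s\n", a[2]/len, a[2], a[1]); }' \
    exp/tri1_debug/all_info.txt | sort -rn | head -n 20

Field 1 of all_info.txt holds "utt-id #edits" (pasted from edits.txt) and field 2 the reference length in words, so this lists the 20 utterances with the highest fraction of misrecognized words, usually a better shortlist for manual inspection than the absolute edit counts ranked by the sort -nr -k2 above.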
diff --git a/egs/wsj/s5/steps/conf/append_eval_to_ctm.py b/egs/wsj/s5/steps/conf/append_eval_to_ctm.py new file mode 100755 index 00000000000..3a35f5a9281 --- /dev/null +++ b/egs/wsj/s5/steps/conf/append_eval_to_ctm.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys,operator + +# Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM': +# (i.e. the output of 'align-text' post-processed by 'wer_per_utt_details.pl') + +# The tags in the appended column are: +# 'C' = correct +# 'S' = substitution +# 'I' = insertion +# 'U' = unknown (not part of scored segment) + +if len(sys.argv) != 4: + print 'Usage: %s eval-in ctm-in ctm-eval-out' % __file__ + sys.exit(1) +dummy, eval_in, ctm_in, ctm_eval_out = sys.argv + +if ctm_eval_out == '-': ctm_eval_out = '/dev/stdout' + +# Read the evaluation, +eval_vec = dict() +with open(eval_in, 'r') as f: + while True: + # Reading 4 lines encoding one utterance, + ref = f.readline() + hyp = f.readline() + op = f.readline() + csid = f.readline() + if not ref: break + # Parse the input, + utt,tag,hyp_vec = hyp.split(' ',2) + assert(tag == 'hyp') + utt,tag,op_vec = op.split(' ',2) + assert(tag == 'op') + hyp_vec = hyp_vec.split() + op_vec = op_vec.split() + # Fill the eval vector with symbols 'C', 'S', 'I', + assert(utt not in eval_vec) + eval_vec[utt] = [] + for op,hyp in zip(op_vec, hyp_vec): + if hyp != '<eps>': eval_vec[utt].append(op) + +# Load the 'ctm' into dictionary, +ctm = dict() +with open(ctm_in) as f: + for l in f: + utt, ch, beg, dur, wrd, conf = l.split() + if not utt in ctm: ctm[utt] = [] + ctm[utt].append((utt, ch, float(beg), float(dur), wrd, float(conf))) + +# Build the 'ctm' with 'eval' column added, +ctm_eval = [] +for utt,ctm_part in ctm.iteritems(): + ctm_part.sort(key = operator.itemgetter(2)) # Sort by 'beg' time, + # extending the 'tuple' by '+': + merged = [ tup + (evl,) for tup,evl in zip(ctm_part,eval_vec[utt]) ] + ctm_eval.extend(merged) + +# Sort again, +ctm_eval.sort(key = operator.itemgetter(0,1,2)) + +# Store, +with open(ctm_eval_out,'w') as f: + for tup in ctm_eval: + f.write('%s %s %f %f %s %f %s\n' % tup) + diff --git a/egs/wsj/s5/steps/conf/append_prf_to_ctm.py b/egs/wsj/s5/steps/conf/append_prf_to_ctm.py new file mode 100755 index 00000000000..547b6176c9f --- /dev/null +++ b/egs/wsj/s5/steps/conf/append_prf_to_ctm.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys + +# Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM': +# (parsed from the 'prf' output of 'sclite') + +# The tags in appended column are: +# 'C' = correct +# 'S' = substitution +# 'I' = insertion +# 'U' = unknown (not part of scored segment) + +# Parse options, +if len(sys.argv) != 4: + print "Usage: %s prf ctm_in ctm_out" % __file__ + sys.exit(1) +prf_file, ctm_file, ctm_out_file = sys.argv[1:] + +if ctm_out_file == '-': ctm_out_file = '/dev/stdout' + +# Load the prf file, +prf = [] +with open(prf_file) as f: + for l in f: + # Store the data, + if l[:5] == 'File:': + file_id = l.split()[1] + if l[:8] == 'Channel:': + chan = l.split()[1] + if l[:5] == 'H_T1:': + h_t1 = l + if l[:5] == 'Eval:': + evl = l + prf.append((file_id,chan,h_t1,evl)) + +# Parse the prf records into dictionary, +prf_dict = dict() +for (f,c,t,e) in prf: + t_pos = 0 # position in the 't' string, + while t_pos < len(t): + t1 = t[t_pos:].split(' ',1)[0] # get 1st token at
't_pos' + try: + # get word evaluation letter 'C,S,I', + evl = e[t_pos] if e[t_pos] != ' ' else 'C' + # add to dictionary, + key='%s,%s' % (f,c) # file,channel + if key not in prf_dict: prf_dict[key] = dict() + prf_dict[key][float(t1)] = evl + except ValueError: + pass + t_pos += len(t1)+1 # advance position for parsing, + +# Load the ctm file (with confidences), +with open(ctm_file) as f: + ctm = [ l.split() for l in f ] + +# Append the sclite alignment tags to ctm, +ctm_out = [] +for f, chan, beg, dur, wrd, conf in ctm: + # U = unknown, C = correct, S = substitution, I = insertion, + sclite_tag = 'U' + try: + sclite_tag = prf_dict[('%s,%s'%(f,chan)).lower()][float(beg)] + except KeyError: + pass + ctm_out.append([f,chan,beg,dur,wrd,conf,sclite_tag]) + +# Save the augmented ctm file, +with open(ctm_out_file, 'w') as f: + f.writelines([' '.join(ctm_record)+'\n' for ctm_record in ctm_out]) + diff --git a/egs/wsj/s5/steps/conf/apply_calibration.sh b/egs/wsj/s5/steps/conf/apply_calibration.sh new file mode 100755 index 00000000000..c1a22e274b8 --- /dev/null +++ b/egs/wsj/s5/steps/conf/apply_calibration.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright 2015, Brno University of Technology (Author: Karel Vesely). Apache 2.0. + +# Trains logistic regression, which calibrates the per-word confidences, +# which are extracted by the Minimum Bayes Risk decoding. + +# begin configuration section. +cmd= +stage=0 +# end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: $0 [opts] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +set -euo pipefail + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +latdir=$3 +caldir=$4 +dir=$5 + +model=$latdir/../final.mdl # assume model one level up from decoding dir. +calibration=$caldir/calibration.mdl +word_feats=$caldir/word_feats +word_categories=$caldir/word_categories + +for f in $lang/words.txt $word_feats $word_categories $latdir/lat.1.gz $calibration $model; do + [ ! 
-f $f ] && echo "$0: Missing file $f" && exit 1 +done +[ -z "$cmd" ] && echo "$0: Missing --cmd '...'" && exit 1 + +[ -d $dir/log ] || mkdir -p $dir/log +nj=$(cat $latdir/num_jobs) +lmwt=$(cat $caldir/lmwt) +decode_mbr=$(cat $caldir/decode_mbr) + +# Store the setup, +echo $lmwt >$dir/lmwt +echo $decode_mbr >$dir/decode_mbr +cp $calibration $dir/calibration.mdl +cp $word_feats $dir/word_feats +cp $word_categories $dir/word_categories + +# Create the ctm with raw confidences, +# - we keep the timing relative to the utterance, +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \ + lattice-scale --inv-acoustic-scale=$lmwt "ark:gunzip -c $latdir/lat.JOB.gz|" ark:- \| \ + lattice-limit-depth ark:- ark:- \| \ + lattice-push --push-strings=false ark:- ark:- \| \ + lattice-align-words-lexicon --max-expand=10.0 \ + $lang/phones/align_lexicon.int $model ark:- ark:- \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \ + '>' $dir/JOB.ctm + # Merge and clean, + for ((n=1; n<=nj; n++)); do cat $dir/${n}.ctm; done > $dir/ctm + rm $dir/*.ctm + cat $dir/ctm | utils/sym2int.pl -f 5 $lang/words.txt >$dir/ctm_int +fi + +# Compute lattice-depth, +latdepth=$dir/lattice_frame_depth.ark +if [ $stage -le 1 ]; then + [ -e $latdepth ] || steps/conf/lattice_depth_per_frame.sh --cmd "$cmd" $latdir $dir +fi + +# Create the forwarding data for logistic regression, +if [ $stage -le 2 ]; then + steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ + --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories +fi + +# Apply calibration model to dev, +if [ $stage -le 3 ]; then + logistic-regression-eval --apply-log=false $calibration \ + ark:$dir/forward_feats.ark ark,t:- | \ + awk '{ key=$1; p_corr=$4; sub(/,.*/,"",key); gsub(/\^/," ",key); print key,p_corr }' | \ + utils/int2sym.pl -f 5 $lang/words.txt \ + >$dir/ctm_calibrated +fi + +exit 0 diff --git a/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py b/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py new file mode 100755 index 00000000000..276d14b88f8 --- /dev/null +++ b/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys, operator + +# This scripts loads a 'ctm' file and converts it into the 'tra' format: +# "utt-key word1 word2 word3 ... wordN" +# The 'utt-key' is the 1st column in the CTM. + +# Typically the CTM contains: +# - utterance-relative timimng (i.e. 
prepared without 'utils/convert_ctm.pl') +# - confidences + +if len(sys.argv) != 3: + print 'Usage: %s ctm-in tra-out' % __file__ + sys.exit(1) +dummy, ctm_in, tra_out = sys.argv + +if ctm_in == '-': ctm_in = '/dev/stdin' +if tra_out == '-': tra_out = '/dev/stdout' + +# Load the 'ctm' into dictionary, +tra = dict() +with open(ctm_in) as f: + for l in f: + utt, ch, beg, dur, wrd, conf = l.split() + if not utt in tra: tra[utt] = [] + tra[utt].append((float(beg),wrd)) + +# Store it in the 'tra' format, +with open(tra_out,'w') as f: + for utt,tuples in tra.iteritems(): + tuples.sort(key = operator.itemgetter(0)) # Sort by 'beg' time, + f.write('%s %s\n' % (utt,' '.join([t[1] for t in tuples]))) + diff --git a/egs/wsj/s5/steps/conf/lattice_depth_per_frame.sh b/egs/wsj/s5/steps/conf/lattice_depth_per_frame.sh new file mode 100755 index 00000000000..7167bd970bb --- /dev/null +++ b/egs/wsj/s5/steps/conf/lattice_depth_per_frame.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Copyright 2015 Brno University of Technology (Author: Karel Vesely) +# Licensed under the Apache License, Version 2.0 (the "License") + +# Extract lattice-depth for each frame. + +# Begin configuration +cmd=run.pl +# End configuration + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "usage: $0 [opts] <decode-dir> <dir>" + echo "main options (for others, see top of script file)" + echo " --config <config-file> # config containing options" + echo " --cmd" + exit 1; +fi + +set -euo pipefail + +latdir=$1 +dir=$2 + +[ ! -f $latdir/lat.1.gz ] && echo "Missing $latdir/lat.1.gz" && exit 1 +nj=$(cat $latdir/num_jobs) + +# Get the per-frame lattice-depths, +$cmd JOB=1:$nj $dir/log/lattice_depth_per_frame.JOB.log \ + lattice-depth-per-frame "ark:gunzip -c $latdir/lat.JOB.gz |" ark,t:$dir/lattice_frame_depth.JOB.ark +# Merge, +for ((n=1; n<=nj; n++)); do cat $dir/lattice_frame_depth.${n}.ark; done >$dir/lattice_frame_depth.ark +rm $dir/lattice_frame_depth.*.ark + +# Done!
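The lattice_frame_depth.ark produced here is a plain-text ark with one line per utterance, "utt-id d1 d2 d3 ...", one integer lattice depth per 10 ms frame; prepare_calibration_data.py below averages these depths over each word's time span (frame index is roughly 100 times the ctm time in seconds) and feeds the log of that average to the calibration model. A one-line sanity check of the depths (the decode directory name below is only an example) is:

  awk '{ n = NF - 1; s = 0; m = 0; for (i = 2; i <= NF; i++) { s += $i; if ($i > m) m = $i; } if (n > 0) printf("%s frames=%d mean-depth=%.2f max-depth=%d\n", $1, n, s/n, m); }' \
    exp/tri4/decode_dev/lattice_frame_depth.ark | head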
diff --git a/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py new file mode 100755 index 00000000000..1be32d4c4d7 --- /dev/null +++ b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys, gzip, re + +# Parse options, +if len(sys.argv) != 4: + print "Usage: %s " % __file__ + sys.exit(0) +words_txt, arpa_gz, unigrams_out = sys.argv[1:] + +if arpa_gz == '-': arpa_gz = '/dev/stdin' +if unigrams_out == '-': unigrams_out = '/dev/stdout' + +# Load the words.txt, +words = [ l.split() for l in open(words_txt) ] + +# Load the unigram probabilities in 10log from ARPA, +wrd_log10 = dict() +with gzip.open(arpa_gz,'r') as f: + read = False + for l in f: + if l.strip() == '\\1-grams:': read = True + if l.strip() == '\\2-grams:': break + if read and len(l.split())>=2: + log10_p_unigram, wrd = re.split('[\t ]+',l.strip(),2)[:2] + wrd_log10[wrd] = float(log10_p_unigram) + +# Create list, 'wrd id log_p_unigram', +words_unigram = [[wrd, id, (wrd_log10[wrd] if wrd in wrd_log10 else -99)] for wrd,id in words ] + +print >>sys.stderr, words_unigram[0] +# Store, +with open(unigrams_out,'w') as f: + f.writelines(['%s %s %g\n' % (w,i,p) for (w,i,p) in words_unigram]) + diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py new file mode 100755 index 00000000000..bc8f92a2f7f --- /dev/null +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys, math + +from optparse import OptionParser +desc = """ +Prepare input features and training targets for logistic regression, +which calibrates the Minimum Bayes Risk posterior confidences. + +The logisitc-regression input features are: +- posteriors from 'ctm' transformed by logit, +- logarithm of word-length in letters, +- 10base logarithm of unigram probability of a word from language model, +- logarithm of average lattice-depth at position of the word (optional), + +The logistic-regresion targets are: +- 1 for correct word, +- 0 for incorrect word (substitution, insertion), + +The iput 'ctm' is augmented by per-word tags (or 'U' is added if no tags), +'C' = correct +'S' = substitution +'I' = insertion +'U' = unknown (not part of scored segment) + +The script can be used both to prepare the training data, +or to prepare input features for forwarding through trained model. +""" +usage = "%prog [opts] ctm word-filter word-length unigrams depth-per-frame-ascii.ark word-categories" +parser = OptionParser(usage=usage, description=desc) +parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='') +parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='') +parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). 
[default %default]", default='') +(o, args) = parser.parse_args() + +if len(args) != 3: + parser.print_help() + sys.exit(1) +ctm_file, word_feats_file, word_categories_file = args + +assert(o.conf_feats != '') + +# Load the ctm (optionally add eval colmn with 'U'): +ctm = [ l.split() for l in open(ctm_file) ] +if len(ctm[0]) == 6: [ l.append('U') for l in ctm ] +assert(len(ctm[0]) == 7) + +# Load the word-features, the format: "wrd wrd_id filter length other_feats" +# (typically 'other_feats' are unigram log-probabilities), +word_feats = [ l.split(None,4) for l in open(word_feats_file) ] + +# Prepare filtering dict, +word_filter = { wrd_id:bool(int(filter)) for (wrd,wrd_id,filter,length,other_feats) in word_feats } +# Prepare the lenght dict, +word_length = { wrd_id:float(length) for (wrd,wrd_id,filter,length,other_feats) in word_feats } +# Prepare other_feats dict, +other_feats = { wrd_id:other_feats.strip() for (wrd,wrd_id,filter,length,other_feats) in word_feats } + +# Build the targets, +if o.conf_targets != '': + with open(o.conf_targets,'w') as f: + for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm: + # Skip the words we don't know if being correct, + if score_tag == 'U': continue + # Some words are excluded from training (partial words, hesitations, etc.), + # (Value: 1 == keep word, 0 == exclude word from the targets), + if not word_filter[wrd_id]: continue + # Build the key, + key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag) + # Build the target, + tgt = 1 if score_tag == 'C' else 0 # Correct = 1, else 0, + # Write, + f.write('%s %d\n' % (key,tgt)) + +# Load the per-frame lattice-depth, +# - we assume, the 1st column in 'ctm' is the 'utterance-key' in depth file, +# - if the 'ctm' and 'ark' keys don't match, we leave this feature out, +if o.lattice_depth: + depths = dict() + for l in open(o.lattice_depth): + utt,d = l.split(' ',1) + depths[utt] = map(int,d.split()) + +# Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt', +wrd_to_cat = [ l.split() for l in open(word_categories_file) ] +wrd_to_cat = { wrd_id:int(category) for wrd,wrd_id,category in wrd_to_cat } +wrd_cat_num = max(wrd_to_cat.values()) + 1 + +# Build the input features, +with open(o.conf_feats,'w') as f: + for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm: + # Build the key, same as previously, + key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag) + + # Build input features, + # - logit of MBR posterior, + damper = 0.001 # avoid -inf,+inf from log, + logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper) + # - log of word-length, + log_word_length = math.log(word_length[wrd_id]) # i.e. 
number of phones in a word, + # - categorical distribution of words (with frequency higher than min-count), + wrd_1_of_k = [0]*wrd_cat_num; + wrd_1_of_k[wrd_to_cat[wrd_id]] = 1; + + # Compose the input feature vector, + feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k + + # Optionally add average-depth of lattice at the word position, + if o.lattice_depth != '': + depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] + log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) + feats += [ log_avg_depth ] + + # Store the input features, + f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n') + diff --git a/egs/wsj/s5/steps/conf/prepare_word_categories.py b/egs/wsj/s5/steps/conf/prepare_word_categories.py new file mode 100755 index 00000000000..3b758001c5a --- /dev/null +++ b/egs/wsj/s5/steps/conf/prepare_word_categories.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys + +from optparse import OptionParser +desc = """ +Prepare mapping of words into categories. Each word with minimal frequency +has its own category, the rest is merged into single class. +""" +usage = "%prog [opts] words.txt ctm category_mapping" +parser = OptionParser(usage=usage, description=desc) +parser.add_option("--min-count", help="Minimum word-count to have a single word category. [default %default]", type='int', default=20) +(o, args) = parser.parse_args() + +if len(args) != 3: + parser.print_help() + sys.exit(1) +words_file, text_file, category_mapping_file = args + +if text_file == '-': text_file = '/dev/stdin' +if category_mapping_file == '-': category_mapping_file = '/dev/stdout' + +# Read the words from the 'tra' file, +with open(text_file) as f: + text_words = [ l.split()[1:] for l in f ] + +# Flatten the array of arrays of words, +import itertools +text_words = list(itertools.chain.from_iterable(text_words)) + +# Count the words (regardless if correct or incorrect), +word_counts = dict() +for w in text_words: + if w not in word_counts: word_counts[w] = 0 + word_counts[w] += 1 + +# Read the words.txt, +with open(words_file) as f: + word_id = [ l.split() for l in f ] + +# Append the categories, +n=1 +word_id_cat=[] +for word, idx in word_id: + cat = 0 + if word in word_counts: + if word_counts[word] > o.min_count: + cat = n; n += 1 + word_id_cat.append([word, idx, str(cat)]) + +# Store the mapping, +with open(category_mapping_file,'w') as f: + f.writelines([' '.join(record)+'\n' for record in word_id_cat]) diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh new file mode 100755 index 00000000000..c2aca05056e --- /dev/null +++ b/egs/wsj/s5/steps/conf/train_calibration.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# Copyright 2015, Brno University of Technology (Author: Karel Vesely). Apache 2.0. + +# Trains logistic regression, which calibrates the per-word confidences in 'CTM'. +# The 'raw' confidences are obtained by Minimum Bayes Risk decoding. + +# The input features of logistic regression are: +# - logit of Minumum Bayer Risk posterior +# - log of word-length in characters +# - log of average-depth depth of a lattice at words' position +# - log of frames per character ratio +# (- categorical distribution of 'lang/words.txt', DISABLED) + +# begin configuration section. 
+cmd= +lmwt=12 +decode_mbr=true +word_min_count=10 # Minimum word-count for single-word category, +normalizer=0.0025 # L2 regularization constant, +category_text= # Alternative corpus for counting words to get word-categories (by default using 'ctm'), +stage=0 +# end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: $0 [opts] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --lmwt # scaling for confidence extraction" + echo " --decode-mbr # use Minimum Bayes Risk decoding" + echo " --grep-filter # remove words from calibration targets" + exit 1; +fi + +set -euo pipefail + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +word_feats=$3 +latdir=$4 +dir=$5 + +model=$latdir/../final.mdl # assume model one level up from decoding dir. + +for f in $data/text $lang/words.txt $word_feats $latdir/lat.1.gz; do + [ ! -f $f ] && echo "$0: Missing file $f" && exit 1 +done +[ -z "$cmd" ] && echo "$0: Missing --cmd '...'" && exit 1 + +[ -d $dir/log ] || mkdir -p $dir/log +nj=$(cat $latdir/num_jobs) + +# Store the setup, +echo $lmwt >$dir/lmwt +echo $decode_mbr >$dir/decode_mbr +cp $word_feats $dir/word_feats + +# Create the ctm with raw confidences, +# - we keep the timing relative to the utterance, +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \ + lattice-scale --inv-acoustic-scale=$lmwt "ark:gunzip -c $latdir/lat.JOB.gz|" ark:- \| \ + lattice-limit-depth ark:- ark:- \| \ + lattice-push --push-strings=false ark:- ark:- \| \ + lattice-align-words-lexicon --max-expand=10.0 \ + $lang/phones/align_lexicon.int $model ark:- ark:- \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \ + '>' $dir/JOB.ctm + # Merge and clean, + for ((n=1; n<=nj; n++)); do cat $dir/${n}.ctm; done > $dir/ctm + rm $dir/*.ctm +fi + +# Get evaluation of the 'ctm' using the 'text' reference, +if [ $stage -le 1 ]; then + steps/conf/convert_ctm_to_tra.py $dir/ctm - | \ + align-text --special-symbol="" ark:$data/text ark:- ark,t:- | \ + utils/scoring/wer_per_utt_details.pl --special-symbol "" \ + >$dir/align_text + # Append alignment to ctm, + steps/conf/append_eval_to_ctm.py $dir/align_text $dir/ctm $dir/ctm_aligned + # Convert words to 'ids', + cat $dir/ctm_aligned | utils/sym2int.pl -f 5 $lang/words.txt >$dir/ctm_aligned_int +fi + +# Prepare word-categories (based on wotd frequencies in 'ctm'), +if [ -z "$category_text" ]; then + steps/conf/convert_ctm_to_tra.py $dir/ctm - | \ + steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt - $dir/word_categories +else + steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt "$category_text" $dir/word_categories +fi + +# Compute lattice-depth, +latdepth=$dir/lattice_frame_depth.ark +if [ $stage -le 2 ]; then + [ -e $latdepth ] || steps/conf/lattice_depth_per_frame.sh --cmd "$cmd" $latdir $dir +fi + +# Create the training data for logistic regression, +if [ $stage -le 3 ]; then + steps/conf/prepare_calibration_data.py \ + --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \ + --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories +fi + +# Train the logistic regression, +if [ $stage -le 4 ]; then + logistic-regression-train --binary=false --normalizer=$normalizer ark:$dir/train_feats.ark \ + ark:$dir/train_targets.ark 
$dir/calibration.mdl 2>$dir/log/logistic-regression-train.log +fi + +# Apply calibration model to dev, +if [ $stage -le 5 ]; then + logistic-regression-eval --apply-log=false $dir/calibration.mdl \ + ark:$dir/train_feats.ark ark,t:- | \ + awk '{ key=$1; p_corr=$4; sub(/,.*/,"",key); gsub(/\^/," ",key); print key,p_corr }' | \ + utils/int2sym.pl -f 5 $lang/words.txt \ + >$dir/ctm_calibrated_int +fi + +exit 0 diff --git a/egs/wsj/s5/steps/decode.sh b/egs/wsj/s5/steps/decode.sh index b0e2fed2017..f2bc1d367fd 100755 --- a/egs/wsj/s5/steps/decode.sh +++ b/egs/wsj/s5/steps/decode.sh @@ -3,8 +3,8 @@ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 -# Begin configuration section. -transform_dir= # this option won't normally be used, but it can be used if you want to +# Begin configuration section. +transform_dir= # this option won't normally be used, but it can be used if you want to # supply existing fMLLR transforms when decoding. iter= model= # You can specify the model to use (e.g. if you want to use the .alimdl) @@ -64,16 +64,16 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs if [ -z "$model" ]; then # if --model was not specified on the command line... - if [ -z $iter ]; then model=$srcdir/final.mdl; + if [ -z $iter ]; then model=$srcdir/final.mdl; else model=$srcdir/$iter.mdl; fi fi if [ $(basename $model) != final.alimdl ] ; then # Do not use the $srcpath -- look at the path where the model is - if [ -f $(dirname $model)/final.alimdl ] ; then - echo -e '\n\n' - echo $0 'WARNING: Running speaker independent system decoding using a SAT model!' - echo $0 'WARNING: This is OK if you know what you are doing...' + if [ -f $(dirname $model)/final.alimdl ] && [ -z "$transform_dir" ]; then + echo -e '\n\n' + echo $0 'WARNING: Running speaker independent system decoding using a SAT model!' + echo $0 'WARNING: This is OK if you know what you are doing...' echo -e '\n\n' fi fi @@ -90,7 +90,7 @@ cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` delta_opts=`cat $srcdir/delta_opts 2>/dev/null` thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; @@ -129,7 +129,7 @@ fi if ! $skip_scoring ; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; - local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || + local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } fi diff --git a/egs/wsj/s5/steps/decode_basis_fmllr.sh b/egs/wsj/s5/steps/decode_basis_fmllr.sh index d0d37aed016..afb914e7f0d 100755 --- a/egs/wsj/s5/steps/decode_basis_fmllr.sh +++ b/egs/wsj/s5/steps/decode_basis_fmllr.sh @@ -95,6 +95,7 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. 
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; @@ -144,7 +145,7 @@ done if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type"; case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_biglm.sh b/egs/wsj/s5/steps/decode_biglm.sh index 9146ab8cebf..0663391430d 100755 --- a/egs/wsj/s5/steps/decode_biglm.sh +++ b/egs/wsj/s5/steps/decode_biglm.sh @@ -45,6 +45,7 @@ srcdir=`dirname $dir`; # The model directory is one level up from decoding direc sdata=$data/split$nj; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -60,7 +61,7 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "decode_si.sh: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_combine.sh b/egs/wsj/s5/steps/decode_combine.sh index ca4f84efdc7..e2926ee0e3a 100755 --- a/egs/wsj/s5/steps/decode_combine.sh +++ b/egs/wsj/s5/steps/decode_combine.sh @@ -47,7 +47,7 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs # The lattice-interp command does the score interpolation (with composition), -# and the lattice-copy-backoff replaces the result with the 1st lattice, in +# and the lattice-copy-backoff replaces the result with the 1st lattice, in # cases where the composed result was empty. $cmd JOB=1:$nj $dir/log/interp.JOB.log \ lattice-interp --alpha=$weight1 "ark:gunzip -c $srcdir1/lat.JOB.gz|" \ @@ -55,6 +55,8 @@ $cmd JOB=1:$nj $dir/log/interp.JOB.log \ lattice-copy-backoff "ark,s,cs:gunzip -c $srcdir1/lat.JOB.gz|" ark,s,cs:- \ "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +cp $srcdir1/final.mdl $dir/final.mdl + if ! $skip_scoring ; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; diff --git a/egs/wsj/s5/steps/decode_fmmi.sh b/egs/wsj/s5/steps/decode_fmmi.sh index b655d076698..5460d37ff28 100755 --- a/egs/wsj/s5/steps/decode_fmmi.sh +++ b/egs/wsj/s5/steps/decode_fmmi.sh @@ -58,6 +58,7 @@ srcdir=`dirname $dir`; # The model directory is one level up from decoding direc sdata=$data/split$nj; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` thread_string= [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" @@ -75,7 +76,7 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "decode_fmmi.sh: feature type is $feat_type"; case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_fwdbwd.sh b/egs/wsj/s5/steps/decode_fwdbwd.sh index 27c2d483301..f0e36227251 100755 --- a/egs/wsj/s5/steps/decode_fwdbwd.sh +++ b/egs/wsj/s5/steps/decode_fwdbwd.sh @@ -75,9 +75,10 @@ echo "decode_fwdbwd.sh: feature type is $feat_type"; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_nolats.sh b/egs/wsj/s5/steps/decode_nolats.sh index 6f5e780cf30..9c05d3eea30 100755 --- a/egs/wsj/s5/steps/decode_nolats.sh +++ b/egs/wsj/s5/steps/decode_nolats.sh @@ -83,9 +83,10 @@ echo "decode.sh: feature type is $feat_type"; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_with_map.sh b/egs/wsj/s5/steps/decode_with_map.sh 
index e05e4de4097..ab507debd11 100755 --- a/egs/wsj/s5/steps/decode_with_map.sh +++ b/egs/wsj/s5/steps/decode_with_map.sh @@ -71,9 +71,10 @@ echo "decode.sh: feature type is $feat_type"; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/get_ctm.sh b/egs/wsj/s5/steps/get_ctm.sh index 3d0ea576a57..2f2f6794e3d 100755 --- a/egs/wsj/s5/steps/get_ctm.sh +++ b/egs/wsj/s5/steps/get_ctm.sh @@ -8,6 +8,7 @@ # begin configuration section. cmd=run.pl stage=0 +frame_shift=0.01 min_lmwt=5 max_lmwt=20 use_segments=true # if we have a segments file, use it to convert @@ -28,6 +29,8 @@ if [ $# -ne 3 ]; then echo " # to produce a ctm relative to the original audio" echo " # files, with channel information (typically needed" echo " # for NIST scoring)." + echo " --frame-shift (default=0.01) # specify this if your lattices have a frame-shift" + echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri4a/decode/" echo "See also: steps/get_train_ctm.sh" @@ -55,7 +58,7 @@ if [ $stage -le 0 ]; then [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" else - filter_cmd=cat + filter_cmd=cat fi if [ -f $lang/phones/word_boundary.int ]; then @@ -63,7 +66,7 @@ if [ $stage -le 0 ]; then set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; else @@ -76,7 +79,7 @@ if [ $stage -le 0 ]; then set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; fi diff --git a/egs/wsj/s5/steps/get_train_ctm.sh b/egs/wsj/s5/steps/get_train_ctm.sh index a6cbb2ac06a..10b29708d84 100755 --- a/egs/wsj/s5/steps/get_train_ctm.sh +++ b/egs/wsj/s5/steps/get_train_ctm.sh @@ -7,9 +7,12 @@ # begin configuration section. cmd=run.pl +frame_shift=0.01 stage=0 use_segments=true # if we have a segments file, use it to convert # the segments to be relative to the original files. +print_silence=false # if true, will print (optional-silence) arcs. + #end configuration section. 
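# Note (illustrative, not part of this change): the 0.01 default matches the usual 10 ms frame shift;
# if the alignments or lattices come from a model decoded with a frame-subsampling factor of 3
# (e.g. 'chain' models), pass --frame-shift 0.03 to this script and to get_ctm.sh so that the ctm
# start times and durations stay in seconds.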
echo "$0 $@" # Print the command line for logging @@ -26,6 +29,8 @@ if [ $# -ne 3 ]; then echo " # to produce a ctm relative to the original audio" echo " # files, with channel information (typically needed" echo " # for NIST scoring)." + echo " --frame-shift (default=0.01) # specify this if your alignments have a frame-shift" + echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri3a_ali" echo "Produces ctm in: exp/tri3a_ali/ctm" @@ -58,9 +63,9 @@ if [ $stage -le 0 ]; then "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ '' '' ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ - gzip -c '>' $dir/ctm.JOB.gz + gzip -c '>' $dir/ctm.JOB.gz || exit 1 else if [ ! -f $lang/phones/align_lexicon.int ]; then echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align." @@ -71,14 +76,14 @@ if [ $stage -le 0 ]; then "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ '' '' ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ - gzip -c '>' $dir/ctm.JOB.gz + gzip -c '>' $dir/ctm.JOB.gz || exit 1 fi fi if [ $stage -le 1 ]; then - if [ -f $data/segments ]; then + if [ -f $data/segments ] && $use_segments; then f=$data/reco2file_and_channel [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; for n in `seq $nj`; do gunzip -c $dir/ctm.$n.gz; done | \ diff --git a/egs/wsj/s5/steps/lmrescore.sh b/egs/wsj/s5/steps/lmrescore.sh index 0652c6c13ca..86595e862b9 100755 --- a/egs/wsj/s5/steps/lmrescore.sh +++ b/egs/wsj/s5/steps/lmrescore.sh @@ -4,6 +4,7 @@ mode=4 cmd=run.pl skip_scoring=false +self_loop_scale=0.1 # End configuration section. echo "$0 $@" # Print the command line for logging @@ -32,6 +33,10 @@ newlm=$newlang/G.fst [ ! -f $newlm ] && echo Missing file $newlm && exit 1; ! ls $indir/lat.*.gz >/dev/null && echo "No lattices input directory $indir" && exit 1; +if ! cmp -s $oldlang/words.txt $newlang/words.txt; then + echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; +fi + oldlmcommand="fstproject --project_output=true $oldlm |" newlmcommand="fstproject --project_output=true $newlm |" @@ -75,7 +80,7 @@ case "$mode" in gzip -c \>$outdir/lat.JOB.gz || exit 1; ;; 3) # 3 is "exact" in that we remove the old LM scores accepting any path - # through G.fst (which is what we want as that happened in lattice + # through G.fst (which is what we want as that happened in lattice # generation), but we add the new one with "phi matcher", only taking # backoff arcs if an explicit arc did not exist. 
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ @@ -100,7 +105,7 @@ case "$mode" in lattice-compose ark:- $outdir/Ldet.fst ark:- \| \ lattice-determinize ark:- ark:- \| \ lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \ - lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \ + lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=$self_loop_scale \ $mdl ark:- ark:- \| \ gzip -c \>$outdir/lat.JOB.gz || exit 1; ;; diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa.sh b/egs/wsj/s5/steps/lmrescore_const_arpa.sh index 3d70d41e59e..81698f07f0d 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa.sh @@ -8,6 +8,8 @@ # Begin configuration section. cmd=run.pl skip_scoring=false +stage=1 +scoring_opts= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -39,22 +41,28 @@ newlm=$newlang/G.carpa ! ls $indir/lat.*.gz >/dev/null &&\ echo "$0: No lattices input directory $indir" && exit 1; +if ! cmp -s $oldlang/words.txt $newlang/words.txt; then + echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; +fi + oldlmcommand="fstproject --project_output=true $oldlm |" mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; cp $indir/num_jobs $outdir -$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore --lm-scale=-1.0 \ - "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ - lattice-lmrescore-const-arpa --lm-scale=1.0 \ - ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore --lm-scale=-1.0 \ + "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +fi -if ! $skip_scoring ; then +if ! $skip_scoring && [ $stage -le 2 ]; then err_msg="Not scoring because local/score.sh does not exist or not executable." [ ! -x local/score.sh ] && echo $err_msg && exit 1; - local/score.sh --cmd "$cmd" $data $newlang $outdir + local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir else echo "Not scoring because requested so..." fi diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh new file mode 100755 index 00000000000..a669f5bc3d5 --- /dev/null +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2015 Guoguo Chen +# Apache 2.0 + +# This script rescores lattices with RNNLM. See also rnnlmrescore.sh which is +# an older script using n-best lists. + +# Begin configuration section. +cmd=run.pl +skip_scoring=false +max_ngram_order=4 +N=10 +inv_acwt=12 +weight=1.0 # Interpolation weight for RNNLM. +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./utils/parse_options.sh + +if [ $# != 5 ]; then + echo "Does language model rescoring of lattices (remove old LM, add new LM)" + echo "with RNNLM." + echo "" + echo "Usage: $0 [options] \\" + echo " " + echo " e.g.: $0 ./rnnlm data/lang_tg data/test \\" + echo " exp/tri3/test_tg exp/tri3/test_rnnlm" + echo "options: [--cmd (run.pl|queue.pl [queue opts])]" + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +rnnlm_dir=$2 +data=$3 +indir=$4 +outdir=$5 + +oldlm=$oldlang/G.fst +if [ -f $oldlang/G.carpa ]; then + oldlm=$oldlang/G.carpa +elif [ ! -f $oldlm ]; then + echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\ + exit 1; +fi + +[ ! 
-f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1; +[ ! -f $rnnlm_dir/unk.probs ] &&\ + echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1; +[ ! -f $oldlang/words.txt ] &&\ + echo "$0: Missing file $oldlang/words.txt" && exit 1; +! ls $indir/lat.*.gz >/dev/null &&\ + echo "$0: No lattices input directory $indir" && exit 1; +awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { + print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ + || exit 1; + +oldlm_command="fstproject --project_output=true $oldlm |" + +acwt=`perl -e "print (1.0/$inv_acwt);"` + +mkdir -p $outdir/log +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + +oldlm_weight=`perl -e "print -1.0 * $weight;"` +if [ "$oldlm" == "$oldlang/G.fst" ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore --lm-scale=$oldlm_weight \ + "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ + lattice-lmrescore-rnnlm --lm-scale=$weight \ + --max-ngram-order=$max_ngram_order ark:$rnnlm_dir/unk.probs \ + $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ + "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +else + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \ + "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ + lattice-lmrescore-rnnlm --lm-scale=$weight \ + --max-ngram-order=$max_ngram_order ark:$rnnlm_dir/unk.probs \ + $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ + "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +fi + +if ! $skip_scoring ; then + err_msg="Not scoring because local/score.sh does not exist or not executable." + [ ! -x local/score.sh ] && echo $err_msg && exit 1; + local/score.sh --cmd "$cmd" $data $oldlang $outdir +else + echo "Not scoring because requested so..." 
+fi + +exit 0; diff --git a/egs/wsj/s5/steps/make_denlats.sh b/egs/wsj/s5/steps/make_denlats.sh index 65b4bb8d320..6afecfe5246 100755 --- a/egs/wsj/s5/steps/make_denlats.sh +++ b/egs/wsj/s5/steps/make_denlats.sh @@ -51,6 +51,7 @@ dir=$4 sdata=$data/split$nj splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` thread_string= [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" @@ -87,7 +88,7 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" cp $srcdir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/make_mfcc.sh b/egs/wsj/s5/steps/make_mfcc.sh index 1d152f6cf8d..09c34d40b24 100755 --- a/egs/wsj/s5/steps/make_mfcc.sh +++ b/egs/wsj/s5/steps/make_mfcc.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 @@ -81,7 +81,7 @@ if [ -f $data/segments ]; then for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done - + utils/split_scp.pl $data/segments $split_segments || exit 1; rm $logdir/.error 2>/dev/null @@ -127,8 +127,8 @@ done > $data/feats.scp rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" echo "consider using utils/fix_data_dir.sh $data" diff --git a/egs/wsj/s5/steps/make_phone_graph.sh b/egs/wsj/s5/steps/make_phone_graph.sh index 4dbb5a8a206..247e5a35d5d 100755 --- a/egs/wsj/s5/steps/make_phone_graph.sh +++ b/egs/wsj/s5/steps/make_phone_graph.sh @@ -4,7 +4,7 @@ # Copyright 2013 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# This script makes a phone-based LM, without smoothing to unigram, that +# This script makes a phone-based LM, without smoothing to unigram, that # is to be used for segmentation, and uses that together with a model to # make a decoding graph. # Uses SRILM. @@ -46,7 +46,7 @@ done loc=`which ngram-count`; if [ -z $loc ]; then if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... - sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 else sdir=`pwd`/../../../tools/srilm/bin/i686 fi @@ -92,17 +92,14 @@ fi if [ $stage -le 3 ]; then echo "$0: creating G_phones.fst from ARPA" - gunzip -c $dir/phone_graph/arpa_noug.gz | arpa2fst - - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | \ - awk '{if (NF < 5 || $5 < 100.0) { print; }}' | \ - fstcompile --isymbols=$lang/phones.txt --osymbols=$lang/phones.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstconnect | \ - fstrmepsilon > $dir/phone_graph/G_phones.fst - fstisstochastic $dir/phone_graph/G_phones.fst || echo "[info]: G_phones not stochastic." 
+ gunzip -c $dir/phone_graph/arpa_noug.gz | \ + arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/phones.txt - - | \ + fstprint | awk '{if (NF < 5 || $5 < 100.0) { print; }}' | fstcompile | \ + fstconnect > $dir/phone_graph/G_phones.fst + fstisstochastic $dir/phone_graph/G_phones.fst || echo "[info]: G_phones not stochastic." fi - + if [ $stage -le 4 ]; then echo "$0: creating CLG." @@ -118,7 +115,7 @@ if [ $stage -le 5 ]; then echo "$0: creating Ha.fst" make-h-transducer --disambig-syms-out=$dir/phone_graph/disambig_tid.int \ --transition-scale=$tscale $dir/phone_graph/ilabels_${N}_${P} $dir/tree $dir/final.mdl \ - > $dir/phone_graph/Ha.fst + > $dir/phone_graph/Ha.fst fi if [ $stage -le 6 ]; then @@ -135,7 +132,7 @@ if [ $stage -le 7 ]; then $dir/final.mdl < $dir/phone_graph/HCLGa.fst > $dir/phone_graph/HCLG.fst || exit 1; if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. + # No point doing this test if transition-scale not 1, as it is bound to fail. fstisstochastic $dir/phone_graph/HCLG.fst || echo "[info]: final HCLG is not stochastic." fi diff --git a/egs/wsj/s5/steps/nnet/align.sh b/egs/wsj/s5/steps/nnet/align.sh index eae3f552658..7ba12cdf114 100755 --- a/egs/wsj/s5/steps/nnet/align.sh +++ b/egs/wsj/s5/steps/nnet/align.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 # Aligns 'data' to sequences of transition-ids using Neural Network based acoustic model. @@ -14,6 +14,7 @@ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" beam=10 retry_beam=40 nnet_forward_opts="--no-softmax=true --prior-scale=1.0" +ivector= # rx-specifier with i-vectors (ark-with-vectors), align_to_lats=false # optionally produce alignment in lattice format lats_decode_opts="--acoustic-scale=0.1 --beam=20 --lattice_beam=10" @@ -27,6 +28,8 @@ use_gpu="no" # yes|no|optionaly [ -f path.sh ] && . ./path.sh # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 4 ]; then echo "usage: $0 " echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali" @@ -78,6 +81,27 @@ feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. 
mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + # nnet-forward, feats="$feats nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |" # diff --git a/egs/wsj/s5/steps/nnet/decode.sh b/egs/wsj/s5/steps/nnet/decode.sh index 35065db20e7..49ba466fc26 100755 --- a/egs/wsj/s5/steps/nnet/decode.sh +++ b/egs/wsj/s5/steps/nnet/decode.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2013 Karel Vesely, Daniel Povey +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely), Daniel Povey # Apache 2.0 # Begin configuration section. @@ -9,6 +9,10 @@ feature_transform= # non-default location of feature_transform (optional) model= # non-default location of transition model (optional) class_frame_counts= # non-default location of PDF counts (optional) srcdir= # non-default location of DNN-dir (decouples model dir from decode dir) +ivector= # rx-specifier with i-vectors (ark-with-vectors), + +blocksoftmax_dims= # 'csl' with block-softmax dimensions: dim1,dim2,dim3,... +blocksoftmax_active= # '1' for the 1st block, stage=0 # stage=1 skips lattice generation nj=4 @@ -35,6 +39,8 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 3 ]; then echo "Usage: $0 [options] " echo "... where is assumed to be a sub-directory of the directory" @@ -109,12 +115,44 @@ feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" # add-pytel transform (optional), [ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" -# + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +# select a block from blocksoftmax, +if [ ! -z "$blocksoftmax_dims" ]; then + # blocksoftmax_active is a csl! dim1,dim2,dim3,... 
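# [Editor's note, not part of the patch] Sketch of the block selection done below,
# assuming for illustration --blocksoftmax-dims 3000,1000:
#   --blocksoftmax-active 1  ->  offset=0,    the copy selects rows 1:3000
#   --blocksoftmax-active 2  ->  offset=3000, the copy selects rows 3001:4000
# The trained multi-task output layer is stripped (--remove-last-components=1) and
# replaced on the fly by a copy-of-one-block plus softmax, so only the outputs of
# the selected block are used for decoding.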
+ [ -z "$blocksoftmax_active" ] && echo "$0 Missing option --blocksoftmax-active N" && exit 1 + # getting dims, + dim_total=$(awk -F'[:,]' '{ for(i=1;i<=NF;i++) { sum += $i }; print sum; }' <(echo $blocksoftmax_dims)) + dim_block=$(awk -F'[:,]' -v active=$blocksoftmax_active '{ print $active; }' <(echo $blocksoftmax_dims)) + offset=$(awk -F'[:,]' -v active=$blocksoftmax_active '{ sum=0; for(i=1;i $dim_total $dim_block $((1+offset)):$((offset+dim_block)) "; + echo " $dim_block $dim_block") $dir/copy_and_softmax.nnet + # nnet is assembled on-the fly, is removed, while + is added, + nnet="nnet-concat 'nnet-copy --remove-last-components=1 $nnet - |' $dir/copy_and_softmax.nnet - |" +fi # Run the decoding in the queue, if [ $stage -le 0 ]; then $cmd --num-threads $((num_threads+1)) JOB=1:$nj $dir/log/decode.JOB.log \ - nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \ + nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu "$nnet" "$feats" ark:- \| \ latgen-faster-mapped$thread_string --min-active=$min_active --max-active=$max_active --max-mem=$max_mem --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ $model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; diff --git a/egs/wsj/s5/steps/nnet/make_bn_feats.sh b/egs/wsj/s5/steps/nnet/make_bn_feats.sh index 1c7b66b02f5..83a2a5fc159 100755 --- a/egs/wsj/s5/steps/nnet/make_bn_feats.sh +++ b/egs/wsj/s5/steps/nnet/make_bn_feats.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Brno University of Technology (author: Karel Vesely) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 # To be run from .. (one directory up from here) # see ../run.sh for example @@ -9,8 +9,10 @@ nj=4 cmd=run.pl remove_last_components=4 # remove N last components from the nnet +nnet_forward_opts= use_gpu=no htk_save=false +ivector= # rx-specifier with i-vectors (ark-with-vectors), # End configuration section. echo "$0 $@" # Print the command line for logging @@ -18,6 +20,8 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 5 ]; then echo "usage: $0 [options] "; echo "options: " @@ -78,12 +82,31 @@ feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$srcdata/utt2spk scp:$srcdata/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. 
mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi if [ $htk_save == false ]; then # Run the forward pass, $cmd JOB=1:$nj $logdir/make_bnfeats.JOB.log \ - nnet-forward --use-gpu=$use_gpu $nnet "$feats" \ + nnet-forward $nnet_forward_opts --use-gpu=$use_gpu $nnet "$feats" \ ark,scp:$bnfeadir/raw_bnfea_$name.JOB.ark,$bnfeadir/raw_bnfea_$name.JOB.scp \ || exit 1; # concatenate the .scp files @@ -101,7 +124,7 @@ else # htk_save == true # Run the forward pass saving HTK features, $cmd JOB=1:$nj $logdir/make_bnfeats_htk.JOB.log \ mkdir -p $data/htkfeats/JOB \; \ - nnet-forward --use-gpu=$use_gpu $nnet "$feats" ark:- \| \ + nnet-forward $nnet_forward_opts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \ copy-feats-to-htk --output-dir=$data/htkfeats/JOB ark:- || exit 1 # Make list of htk features, find $data/htkfeats -name *.fea >$data/htkfeats.scp diff --git a/egs/wsj/s5/steps/nnet/make_denlats.sh b/egs/wsj/s5/steps/nnet/make_denlats.sh index 02d25c744d7..3ad1d248df3 100755 --- a/egs/wsj/s5/steps/nnet/make_denlats.sh +++ b/egs/wsj/s5/steps/nnet/make_denlats.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2012-2013 Karel Vesely, Daniel Povey +# Copyright 2012-2013 Brno University of Technology (author: Karel Vesely), Daniel Povey # Apache 2.0. # Create denominator lattices for MMI/MPE/sMBR training. @@ -22,12 +22,15 @@ max_mem=20000000 # This will stop the processes getting too large. # End configuration section. use_gpu=no # yes|no|optional parallel_opts="--num-threads 2" +ivector= # rx-specifier with i-vectors (ark-with-vectors), echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 4 ]; then echo "Usage: steps/$0 [options] " echo " e.g.: steps/$0 data/train data/lang exp/tri1 exp/tri1_denlats" @@ -110,15 +113,35 @@ feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + # nnet-forward, feats="$feats nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |" -# # if this job is interrupted by the user, we want any background jobs to be # killed too. 
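# [Editor's note, not part of the patch] The '|| true' added to cleanup() below is
# presumably needed because this script now runs under 'set -euo pipefail' (added
# above): when there are no background jobs, $pids is empty, the
# '[ -n "$pids" ] && kill $pids' compound returns non-zero, and without '|| true'
# the trap handler itself would trip the errexit handling.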
cleanup() { local pids=$(jobs -pr) - [ -n "$pids" ] && kill $pids + [ -n "$pids" ] && kill $pids || true } trap "cleanup" INT QUIT TERM EXIT @@ -140,7 +163,7 @@ else # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim # to have at most two jobs running at each time. The idea is that if we have stragglers # from one job, we can be processing another one at the same time. - rm $dir/.error 2>/dev/null + rm -f $dir/.error prev_pid= for n in `seq $[nj+1]`; do diff --git a/egs/wsj/s5/steps/nnet/make_fmllr_feats.sh b/egs/wsj/s5/steps/nnet/make_fmllr_feats.sh index fd2ab230f47..c9d679004f1 100755 --- a/egs/wsj/s5/steps/nnet/make_fmllr_feats.sh +++ b/egs/wsj/s5/steps/nnet/make_fmllr_feats.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely), +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely), # # Apache 2.0. # @@ -19,19 +19,21 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 5 ]; then echo "Usage: $0 [options] " echo "e.g.: $0 data-fmllr/train data/train exp/tri5a exp/make_fmllr_feats/log plp/processed/" echo "" - echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" - echo "what type of features you used (assuming it's one of these two)" - echo "You can also use fMLLR features-- you have to supply --transform-dir option." + echo "This script dumps fMLLR features to disk, so it can be used for NN training." + echo "It automoatically figures out the 'feature-type' of the source GMM systems." echo "" echo "main options (for others, see top of script file)" echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs" echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --transform-dir # where to find fMLLR transforms." + echo " --transform-dir # dir with fMLLR transforms" + echo " --raw-transform-dir # dir with raw-fMLLR transforms" exit 1; fi @@ -42,9 +44,12 @@ logdir=$4 feadir=$5 sdata=$srcdata/split$nj; -splice_opts=`cat $gmmdir/splice_opts 2>/dev/null` # frame-splicing options. -cmvn_opts=`cat $gmmdir/cmvn_opts 2>/dev/null` -delta_opts=`cat $gmmdir/delta_opts 2>/dev/null` + +# Get the config, +D=$gmmdir +[ -f $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) || cmvn_opts= +[ -f $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) || delta_opts= +[ -f $D/splice_opts ] && splice_opts=$(cat $D/splice_opts) || splice_opts= mkdir -p $data $logdir $feadir [[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1; @@ -59,22 +64,17 @@ done echo "$0: Missing $raw_transform_dir/raw_trans.1" && exit 1; # Figure-out the feature-type, -feat_type=delta # Default -[ ! -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=delta_fmllr -[ -f $gmmdir/final.mat ] && feat_type=lda -[ -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=lda_fmllr +feat_type="[UNKNOWN]" +[ -z "$raw_transform_dir" -a ! -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=delta_fmllr +[ -z "$raw_transform_dir" -a -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=lda_fmllr [ ! -z "$raw_transform_dir" ] && feat_type=raw_fmllr -[ ! -z "$raw_transform_dir" -a -f $gmmdir/final.mat -a ! 
-z "$transform_dir" ] && feat_type=raw_fmllr_lda_fmllr echo "$0: feature type is $feat_type"; # Hand-code the feature pipeline, case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; delta_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- |";; lda_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |";; raw_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$raw_transform_dir/raw_trans.JOB ark:- ark:- |";; - raw_fmllr_lda_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$cur_trans_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/nnet/make_fmmi_feats.sh b/egs/wsj/s5/steps/nnet/make_fmmi_feats.sh index a0b28250aa6..2874f00067b 100755 --- a/egs/wsj/s5/steps/nnet/make_fmmi_feats.sh +++ b/egs/wsj/s5/steps/nnet/make_fmmi_feats.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely), +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely), # # Apache 2.0 # @@ -20,6 +20,8 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 5 ]; then echo "Usage: $0 [options] " echo "e.g.: $0 data-fmmi/train data/train exp/tri5a_fmmi_b0.1 data-fmmi/train/_log data-fmmi/train/_data " @@ -44,8 +46,11 @@ logdir=$4 feadir=$5 sdata=$srcdata/split$nj; -splice_opts=`cat $gmmdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $gmmdir/cmvn_opts 2>/dev/null` + +# Get the config, +D=$gmmdir +[ -f $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) || cmvn_opts= +[ -f $D/splice_opts ] && splice_opts=$(cat $D/splice_opts) || splice_opts= mkdir -p $data $logdir $feadir [[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1; diff --git a/egs/wsj/s5/steps/nnet/make_priors.sh b/egs/wsj/s5/steps/nnet/make_priors.sh index f3e9c1edbee..3e7967a1b58 100755 --- a/egs/wsj/s5/steps/nnet/make_priors.sh +++ b/egs/wsj/s5/steps/nnet/make_priors.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Brno University of Technology (author: Karel Vesely) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 # To be run from .. 
(one directory up from here) # see ../run.sh for example @@ -9,6 +9,7 @@ nj=4 cmd=run.pl use_gpu=no +ivector= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -16,6 +17,8 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 2 ]; then echo "usage: $0 [options] "; echo "options: " @@ -43,6 +46,9 @@ sdata=$data/split$nj echo "Accumulating prior stats by forwarding '$data' with '$nndir'" +# We estimate priors on 10k utterances, selected randomly from the splitted data, +N=$((10000/nj)) + # PREPARE FEATURE EXTRACTION PIPELINE # import config, cmvn_opts= @@ -54,13 +60,32 @@ D=$nndir [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) # # Create the feature stream, -feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +feats="ark:cat $sdata/JOB/feats.scp | utils/shuffle_list.pl --srand 777 | head -n$N | copy-feats scp:- ark:- |" # apply-cmvn (optional), [ ! -z "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1 [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi # Run the forward pass, $cmd JOB=1:$nj $nndir/log/prior_stats.JOB.log \ diff --git a/egs/wsj/s5/steps/nnet/pretrain_dbn.sh b/egs/wsj/s5/steps/nnet/pretrain_dbn.sh index c8d9250f420..0895ddf1500 100755 --- a/egs/wsj/s5/steps/nnet/pretrain_dbn.sh +++ b/egs/wsj/s5/steps/nnet/pretrain_dbn.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2013-2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,46 +14,54 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. -# To be run from .. +# To be run from ../../ # -# Deep Belief Network pre-training by Contrastive Divergence (CD-1) algorithm. -# The script can pre-train on plain features (ie. saved fMLLR features), -# or modified features (CMN, delta). -# The script creates feature-transform in nnet format, which contains splice -# and shift+scale (zero mean and unit variance on DBN input). +# Restricted Boltzman Machine (RBM) pre-training by Contrastive Divergence +# algorithm (CD-1). A stack of RBMs forms a Deep Belief Neetwork (DBN). 
+# +# This script by default pre-trains on plain features (ie. saved fMLLR features), +# building a 'feature_transform' containing +/-5 frame splice and global CMVN. +# +# There is also a support for adding speaker-based CMVN, deltas, i-vectors, +# or passing custom 'feature_transform' or its prototype. # -# For special cases it is possible to use external feature-transform. -# # Begin configuration. -# -# nnet config -nn_depth=6 #number of hidden layers -hid_dim=2048 #number of units per layer -param_stddev_first=0.1 #init parameters in 1st RBM -param_stddev=0.1 #init parameters in other RBMs + +# topology, initialization, +nn_depth=6 # number of hidden layers, +hid_dim=2048 # number of neurons per layer, +param_stddev_first=0.1 # init parameters in 1st RBM +param_stddev=0.1 # init parameters in other RBMs input_vis_type=gauss # type of visible nodes on DBN input -# number of iterations -rbm_iter=1 #number of pre-training epochs (Gaussian-Bernoulli RBM has 2x more) -# pre-training opts -rbm_lrate=0.4 #RBM learning rate -rbm_lrate_low=0.01 #lower RBM learning rate (for Gaussian units) -rbm_l2penalty=0.0002 #L2 penalty (increases RBM-mixing rate) + +# number of iterations, +rbm_iter=1 # number of pre-training epochs (Gaussian-Bernoulli RBM has 2x more) + +# pre-training opts, +rbm_lrate=0.4 # RBM learning rate +rbm_lrate_low=0.01 # lower RBM learning rate (for Gaussian units) +rbm_l2penalty=0.0002 # L2 penalty (increases RBM-mixing rate) rbm_extra_opts= -# data processing config -copy_feats=true # resave the features randomized consecutively to tmpdir - copy_feats_tmproot= # tmproot for copy-feats (optional) -# feature config -feature_transform= # Optionally reuse feature processing front-end (override splice,etc.) -feature_transform_proto= # Optionally pass prototype of feature transform -cmvn_opts= # Optionally do CMVN of the input features with options -delta_opts= # Optionally use deltas on the input features -splice=5 # Temporal splicing -splice_step=1 # Stepsize of the splicing (1 is consecutive splice, - # value 2 would do [ -10 -8 -6 -4 -2 0 2 4 6 8 10 ] splicing) + +# data processing, +copy_feats=true # resave the features to tmpdir, +copy_feats_tmproot=/tmp/kaldi.XXXX # sets tmproot for 'copy-feats', + +# feature processing, +splice=5 # (default) splice features both-ways along time axis, +cmvn_opts= # (optional) adds 'apply-cmvn' to input feature pipeline, see opts, +delta_opts= # (optional) adds 'add-deltas' to input feature pipeline, see opts, +ivector= # (optional) adds 'append-vector-to-feats', the option is rx-filename for the 2nd stream, +ivector_append_tool=append-vector-to-feats # (optional) the tool for appending ivectors, + +feature_transform_proto= # (optional) use this prototype for 'feature_transform', +feature_transform= # (optional) directly use this 'feature_transform', + # misc. verbose=1 # enable per-cache reports skip_cuda_check=false + # End configuration. echo "$0 $@" # Print the command line for logging @@ -61,6 +69,7 @@ echo "$0 $@" # Print the command line for logging [ -f path.sh ] && . ./path.sh; . 
parse_options.sh || exit 1; +set -euo pipefail if [ $# != 2 ]; then echo "Usage: $0 " @@ -71,22 +80,23 @@ if [ $# != 2 ]; then echo " --nn-depth # number of RBM layers" echo " --hid-dim # number of hidden units per layer" echo " --rbm-iter # number of CD-1 iterations per layer" - echo " --dbm-drop-data # probability of frame-dropping," echo " # can be used to subsample large datasets" echo " --rbm-lrate # learning-rate for Bernoulli-Bernoulli RBMs" echo " --rbm-lrate-low # learning-rate for Gaussian-Bernoulli RBM" echo "" - echo " --copy-feats # copy features to /tmp, to accelerate training" - echo " --apply-cmvn # normalize input features (opt.)" - echo " --norm-vars # use variance normalization (opt.)" + echo " --cmvn-opts # add 'apply-cmvn' to input feature pipeline" + echo " --delta-opts # add 'add-deltas' to input feature pipeline" echo " --splice # splice +/-N frames of input features" + echo " --copy-feats # copy features to /tmp, lowers storage stress" + echo "" + echo " --feature_transform_proto # use this prototype for 'feature_transform'" + echo " --feature-transform # directly use this 'feature_transform'" exit 1; fi data=$1 dir=$2 - for f in $data/feats.scp; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done @@ -94,7 +104,7 @@ done echo "# INFO" echo "$0 : Pre-training Deep Belief Network as a stack of RBMs" printf "\t dir : $dir \n" -printf "\t Train-set : $data \n" +printf "\t Train-set : $data '$(cat $data/feats.scp | wc -l)'\n" echo [ -e $dir/${nn_depth}.dbn ] && echo "$0 Skipping, already have $dir/${nn_depth}.dbn" && exit 0 @@ -107,104 +117,153 @@ mkdir -p $dir/log ###### PREPARE FEATURES ###### echo echo "# PREPARING FEATURES" -# shuffle the list -echo "Preparing train/cv lists" -cat $data/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp -# print the list size -wc -l $dir/train.scp - -# re-save the shuffled features, so they are stored sequentially on the disk in /tmp/ if [ "$copy_feats" == "true" ]; then - tmpdir=$(mktemp -d $copy_feats_tmproot); mv $dir/train.scp{,_non_local} - copy-feats scp:$dir/train.scp_non_local ark,scp:$tmpdir/train.ark,$dir/train.scp || exit 1 - trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT + # re-save the features to local disk into /tmp/, + tmpdir=$(mktemp -d $copy_feats_tmproot) + trap "echo \"# Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" INT QUIT TERM EXIT + copy-feats scp:$data/feats.scp ark,scp:$tmpdir/train.ark,$dir/train_sorted.scp || exit 1 +else + # or copy the list, + cp $data/feats.scp $dir/train_sorted.scp fi +# shuffle the list, +utils/shuffle_list.pl --srand 777 <$dir/train_sorted.scp >$dir/train.scp -# create a 10k utt subset for global cmvn estimates +# create a 10k utt subset for global cmvn estimates, head -n 10000 $dir/train.scp > $dir/train.scp.10k +# for debugging, add list with non-local features, +utils/shuffle_list.pl --srand 777 <$data/feats.scp >$dir/train.scp_non_local + ###### OPTIONALLY IMPORT FEATURE SETTINGS ###### +ivector_dim= # no ivectors, if [ ! 
-z $feature_transform ]; then D=$(dirname $feature_transform) - echo "Importing feature settings from: $transf_dir" + echo "# importing feature settings from dir '$D'" [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) - echo "Imported config : cmvn_opts='$cmvn_opts' delta_opts='$delta_opts'" + [ -e $D/ivector_dim ] && ivector_dim=$(cat $D/ivector_dim) + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + echo "# cmvn_opts='$cmvn_opts' delta_opts='$delta_opts' ivector_dim='$ivector_dim'" fi ###### PREPARE FEATURE PIPELINE ###### - # read the features -feats="ark:copy-feats scp:$dir/train.scp ark:- |" +feats_tr="ark:copy-feats scp:$dir/train.scp ark:- |" # optionally add per-speaker CMVN if [ ! -z "$cmvn_opts" ]; then - echo "Will use CMVN statistics : $data/cmvn.scp" + echo "+ 'apply-cmvn' with '$cmvn_opts' using statistics : $data/cmvn.scp" [ ! -r $data/cmvn.scp ] && echo "Missing $data/cmvn.scp" && exit 1; - cmvn="scp:$data/cmvn.scp" - feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk $cmvn ark:- ark:- |" + [ ! -r $data/utt2spk ] && echo "Missing $data/utt2spk" && exit 1; + feats_tr="$feats_tr apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |" else - echo "apply-cmvn not used" + echo "# 'apply-cmvn' not used," fi # optionally add deltas if [ ! -z "$delta_opts" ]; then - feats="$feats add-deltas $delta_opts ark:- ark:- |" + feats_tr="$feats_tr add-deltas $delta_opts ark:- ark:- |" + echo "# + 'add-deltas' with '$delta_opts'" fi # keep track of the config, -[ ! -z "$cmvn_opts" ] && echo "$cmvn_opts" >$dir/cmvn_opts +[ ! -z "$cmvn_opts" ] && echo "$cmvn_opts" >$dir/cmvn_opts [ ! -z "$delta_opts" ] && echo "$delta_opts" >$dir/delta_opts # +# get feature dim, +feat_dim=$(feat-to-dim "$feats_tr" -) +echo "# feature dim : $feat_dim (input of 'feature_transform')" -# get feature dim -echo -n "Getting feature dim : " -feat_dim=$(feat-to-dim --print-args=false "$feats" -) -echo $feat_dim - - -# Now we will start building feature_transform which will -# be applied in CUDA to gain more speed. +# Now we start building 'feature_transform' which goes right in front of a NN. +# The forwarding is computed on a GPU before the frame shuffling is applied. # -# We will use 1GPU for both feature_transform and MLP training in one binary tool. -# It is necessary, because we need to run it as a single process, using single GPU -# and avoiding I/O overheads. +# Same GPU is used both for 'feature_transform' and the NN training. +# So it has to be done by a single process (we are using exclusive mode). +# This also reduces the CPU-GPU uploads/downloads to minimum. if [ ! -z "$feature_transform" ]; then - echo Using already prepared feature_transform: $feature_transform - cp $feature_transform $dir/final.feature_transform + echo "# importing 'feature_transform' from '$feature_transform'" + tmp=$dir/imported_$(basename $feature_transform) + cp $feature_transform $tmp; feature_transform=$tmp else - if [ ! -z "$feature_transform_proto" ]; then - feature_transform=$dir/tr_$(basename $feature_transform_proto) - log=$dir/log/feature-transform-initialize.log - nnet-initialize --binary=false $feature_transform_proto $feature_transform 2>$log || { cat $log; exit 1; } + # Make default proto with splice, + if [ ! 
-z $feature_transform_proto ]; then + echo "# importing custom 'feature_transform_proto' from : $feature_transform_proto" else - # Generate the splice transform - echo "Using splice +/- $splice , step $splice_step" - feature_transform=$dir/tr_splice$splice-$splice_step.nnet - utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice --splice-step=$splice_step > $feature_transform + echo "+ default 'feature_transform_proto' with splice +/-$splice frames" + feature_transform_proto=$dir/splice${splice}.proto + echo " $feat_dim $(((2*splice+1)*feat_dim)) -$splice:$splice " >$feature_transform_proto fi - # Renormalize the MLP input to zero mean and unit variance + # Initialize 'feature-transform' from a prototype, + feature_transform=$dir/tr_$(basename $feature_transform_proto .proto).nnet + nnet-initialize --binary=false $feature_transform_proto $feature_transform + + # Renormalize the MLP input to zero mean and unit variance, feature_transform_old=$feature_transform feature_transform=${feature_transform%.nnet}_cmvn-g.nnet - echo "Renormalizing MLP input features into $feature_transform" - nnet-forward --use-gpu=yes \ - $feature_transform_old "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \ - ark:- 2>$dir/log/cmvn_glob_fwd.log |\ - compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\ - nnet-concat --binary=false $feature_transform_old - $feature_transform - - # MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ###### - [ -f $dir/final.feature_transform ] && unlink $dir/final.feature_transform - (cd $dir; ln -s $(basename $feature_transform) final.feature_transform ) + echo "# compute normalization stats from 10k sentences" + nnet-forward --print-args=true --use-gpu=yes $feature_transform_old \ + "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" ark:- |\ + compute-cmvn-stats ark:- $dir/cmvn-g.stats + echo "# + normalization of NN-input at '$feature_transform'" + nnet-concat --print-args=false --binary=false $feature_transform_old \ + "cmvn-to-nnet $dir/cmvn-g.stats -|" $feature_transform fi +if [ ! -z $ivector ]; then + echo + echo "# ADDING IVECTOR FEATURES" + # The iVectors are concatenated 'as they are' directly to the input of the neural network, + # To do this, we paste the features, and use where the 1st component + # contains the transform and 2nd network contains component. + + echo "# getting dims," + dim_raw=$(feat-to-dim "$feats_tr" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + echo "# dims, feats-raw $dim_raw, ivectors $dim_ivec," + + # Should we do something with 'feature_transform'? + if [ ! -z $ivector_dim ]; then + # No, the 'ivector_dim' comes from dir with 'feature_transform' with iVec forwarding, + echo "# assuming we got '$feature_transform' with ivector forwarding," + [ $ivector_dim != $dim_ivec ] && \ + echo -n "Error, i-vector dimensionality mismatch!" 
&& \ + echo " (expected $ivector_dim, got $dim_ivec in $ivector)" && exit 1 + else + # Yes, adjust the transform to do ``iVec forwarding'', + feature_transform_old=$feature_transform + feature_transform=${feature_transform%.nnet}_ivec_copy.nnet + echo "# setting up ivector forwarding into '$feature_transform'," + dim_transformed=$(feat-to-dim "$feats_tr nnet-forward $feature_transform_old ark:- ark:- |" -) + nnet-initialize --print-args=false <(echo " $dim_ivec $dim_ivec 1:$dim_ivec ") $dir/tr_ivec_copy.nnet + nnet-initialize --print-args=false <(echo " $((dim_raw+dim_ivec)) $((dim_transformed+dim_ivec)) $feature_transform_old $dir/tr_ivec_copy.nnet ") $feature_transform + fi + echo $dim_ivec >$dir/ivector_dim # mark down the iVec dim! + echo $ivector_append_tool >$dir/ivector_append_tool + + # pasting the iVecs to the feaures, + echo "# + ivector input '$ivector'" + feats_tr="$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +###### Show the final 'feature_transform' in the log, +echo +echo "### Showing the final 'feature_transform':" +nnet-info $feature_transform +echo "###" + +###### MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ###### +[ -f $dir/final.feature_transform ] && unlink $dir/final.feature_transform +(cd $dir; ln -s $(basename $feature_transform) final.feature_transform ) +feature_transform=$dir/final.feature_transform ###### GET THE DIMENSIONS ###### -num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null) +num_fea=$(feat-to-dim --print-args=false "$feats_tr nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null) num_hid=$hid_dim @@ -215,61 +274,55 @@ for depth in $(seq 1 $nn_depth); do RBM=$dir/$depth.rbm [ -f $RBM ] && echo "RBM '$RBM' already trained, skipping." && continue - # The first RBM needs special treatment, because of Gussian input nodes + # The first RBM needs special treatment, because of Gussian input nodes, if [ "$depth" == "1" ]; then # This is usually Gaussian-Bernoulli RBM (not if CNN layers are part of input transform) - # initialize - echo "Initializing '$RBM.init'" - echo " - $num_fea $num_hid $input_vis_type bern $param_stddev_first - - " > $RBM.proto + # initialize, + echo "# initializing '$RBM.init'" + echo " $num_fea $num_hid $input_vis_type bern $param_stddev_first" > $RBM.proto nnet-initialize $RBM.proto $RBM.init 2>$dir/log/nnet-initialize.$depth.log || exit 1 - # pre-train - num_iter=$rbm_iter; [ $input_vis_type == "gauss" ] && num_iter=$((2*rbm_iter)) #2x more epochs for Gaussian input + # pre-train, + num_iter=$rbm_iter; [ $input_vis_type == "gauss" ] && num_iter=$((2*rbm_iter)) # 2x more epochs for Gaussian input [ $input_vis_type == "bern" ] && rbm_lrate_low=$rbm_lrate # original lrate for Bernoulli input - echo "Pretraining '$RBM' (input $input_vis_type, lrate $rbm_lrate_low, iters $num_iter)" + echo "# pretraining '$RBM' (input $input_vis_type, lrate $rbm_lrate_low, iters $num_iter)" rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate_low --l2-penalty=$rbm_l2penalty \ --num-iters=$num_iter --verbose=$verbose \ --feature-transform=$feature_transform \ $rbm_extra_opts \ - $RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1 + $RBM.init "$feats_tr" $RBM 2>$dir/log/rbm.$depth.log || exit 1 else - #This is Bernoulli-Bernoulli RBM - #cmvn stats for init - echo "Computing cmvn stats '$dir/$depth.cmvn' for RBM initialization" - if [ ! 
-f $dir/$depth.cmvn ]; then - nnet-forward --use-gpu=yes \ - "nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \ - "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \ - ark:- 2>$dir/log/cmvn_fwd.$depth.log | \ - compute-cmvn-stats ark:- - 2>$dir/log/cmvn.$depth.log | \ - cmvn-to-nnet - $dir/$depth.cmvn || exit 1 + # This is Bernoulli-Bernoulli RBM, + # cmvn stats for init, + echo "# computing cmvn stats '$dir/$depth.cmvn' for RBM initialization" + if [ ! -f $dir/$depth.cmvn ]; then + nnet-forward --print-args=false --use-gpu=yes \ + "nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \ + "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" ark:- | \ + compute-cmvn-stats --print-args=false ark:- - | \ + cmvn-to-nnet --print-args=false - $dir/$depth.cmvn || exit 1 else - echo compute-cmvn-stats already done, skipping. + echo "# compute-cmvn-stats already done, skipping." fi - #initialize - echo "Initializing '$RBM.init'" - echo " - $num_hid $num_hid bern bern $param_stddev $dir/$depth.cmvn - - " > $RBM.proto + # initialize, + echo "initializing '$RBM.init'" + echo " $num_hid $num_hid bern bern $param_stddev $dir/$depth.cmvn" > $RBM.proto nnet-initialize $RBM.proto $RBM.init 2>$dir/log/nnet-initialize.$depth.log || exit 1 - #pre-train - echo "Pretraining '$RBM' (lrate $rbm_lrate, iters $rbm_iter)" + # pre-train, + echo "pretraining '$RBM' (lrate $rbm_lrate, iters $rbm_iter)" rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate --l2-penalty=$rbm_l2penalty \ --num-iters=$rbm_iter --verbose=$verbose \ --feature-transform="nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \ $rbm_extra_opts \ - $RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1 + $RBM.init "$feats_tr" $RBM 2>$dir/log/rbm.$depth.log || exit 1 fi - #Create DBN stack + # Create DBN stack, if [ "$depth" == "1" ]; then - rbm-convert-to-nnet --binary=true $RBM $dir/$depth.dbn - else - rbm-convert-to-nnet --binary=true $RBM - | \ - nnet-concat $dir/$((depth-1)).dbn - $dir/$depth.dbn + echo "# converting RBM to $dir/$depth.dbn" + rbm-convert-to-nnet $RBM $dir/$depth.dbn + else + echo "# appending RBM to $dir/$depth.dbn" + nnet-concat $dir/$((depth-1)).dbn "rbm-convert-to-nnet $RBM - |" $dir/$depth.dbn fi done @@ -278,7 +331,7 @@ echo echo "# REPORT" echo "# RBM pre-training progress (line per-layer)" grep progress $dir/log/rbm.*.log -echo +echo echo "Pre-training finished." diff --git a/egs/wsj/s5/steps/nnet/train.sh b/egs/wsj/s5/steps/nnet/train.sh index 1f53c3eb1b7..9f05b34f4d3 100755 --- a/egs/wsj/s5/steps/nnet/train.sh +++ b/egs/wsj/s5/steps/nnet/train.sh @@ -1,92 +1,91 @@ #!/bin/bash -# Copyright 2012/2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 # Begin configuration. -config= # config, which is also sent to all other scripts - -# NETWORK INITIALIZATION -nnet_init= # select initialized MLP (override initialization) -nnet_proto= # select network prototype (initialize it) -proto_opts= # non-default options for 'make_nnet_proto.py' -feature_transform= # provide feature transform (=splice,rescaling,...) (don't build new one) -pytel_transform= # use external transform defined in python (BUT specific) -network_type=dnn # (dnn,cnn1d,cnn2d,lstm) select type of neural network -cnn_proto_opts= # extra options for 'make_cnn_proto.py' -# -hid_layers=4 # nr. 
of hidden layers (prior to sotfmax or bottleneck) -hid_dim=1024 # select hidden dimension -bn_dim= # set a value to get a bottleneck network -dbn= # select DBN to prepend to the MLP initialization -# -init_opts= # options, passed to the initialization script - -# FEATURE PROCESSING -copy_feats=true # resave the train/cv features into /tmp (disabled by default) - copy_feats_tmproot= # tmproot for copy-feats (optional) -# feature config (applies always) -cmvn_opts= -delta_opts= -# feature_transform: -splice=5 # temporal splicing -splice_step=1 # stepsize of the splicing (1 == no gap between frames) -feat_type=plain -# feature config (applies to feat_type traps) -traps_dct_basis=11 # nr. od DCT basis (applies to `traps` feat_type, splice10 ) -# feature config (applies to feat_type transf) (ie. LDA+MLLT, no fMLLR) -transf= -splice_after_transf=5 -# feature config (applies to feat_type lda) -lda_dim=300 # LDA dimension (applies to `lda` feat_type) - -# LABELS -labels= # use these labels to train (override deafault pdf alignments, has to be in 'Posterior' format, see ali-to-post) -num_tgt= # force to use number of outputs in the MLP (default is autodetect) - -# TRAINING SCHEDULER -learn_rate=0.008 # initial learning rate -train_opts= # options, passed to the training script -train_tool= # optionally change the training tool -frame_weights= # per-frame weights for gradient weighting - -# OTHER -seed=777 # seed value used for training data shuffling and initialization + +config= # config, also forwarded to 'train_scheduler.sh', + +# topology, initialization, +network_type=dnn # select type of neural network (dnn,cnn1d,cnn2d,lstm), +hid_layers=4 # nr. of hidden layers (before sotfmax or bottleneck), +hid_dim=1024 # number of neurons per layer, +bn_dim= # (optional) adds bottleneck and one more hidden layer to the NN, +dbn= # (optional) prepend layers to the initialized NN, + +proto_opts= # adds options to 'make_nnet_proto.py', +cnn_proto_opts= # adds options to 'make_cnn_proto.py', + +nnet_init= # (optional) use this pre-initialized NN, +nnet_proto= # (optional) use this NN prototype for initialization, + +# feature processing, +splice=5 # (default) splice features both-ways along time axis, +cmvn_opts= # (optional) adds 'apply-cmvn' to input feature pipeline, see opts, +delta_opts= # (optional) adds 'add-deltas' to input feature pipeline, see opts, +ivector= # (optional) adds 'append-vector-to-feats', the option is rx-filename for the 2nd stream, +ivector_append_tool=append-vector-to-feats # (optional) the tool for appending ivectors, + +feat_type=plain +traps_dct_basis=11 # (feat_type=traps) nr. 
of DCT basis, 11 is good with splice=10, +transf= # (feat_type=transf) import this linear tranform, +splice_after_transf=5 # (feat_type=transf) splice after the linear transform, + +feature_transform_proto= # (optional) use this prototype for 'feature_transform', +feature_transform= # (optional) directly use this 'feature_transform', +pytel_transform= # (BUT) use external python transform, + +# labels, +labels= # (optional) specify non-default training targets, + # (targets need to be in posterior format, see 'ali-to-post', 'feat-to-post'), +num_tgt= # (optional) specifiy number of NN outputs, to be used with 'labels=', + +# training scheduler, +learn_rate=0.008 # initial learning rate, +scheduler_opts= # options, passed to the training scheduler, +train_tool= # optionally change the training tool, +train_tool_opts= # options for the training tool, +frame_weights= # per-frame weights for gradient weighting, +utt_weights= # per-utterance weights (scalar for --frame-weights), + +# data processing, misc. +copy_feats=true # resave the train/cv features into /tmp (disabled by default), +copy_feats_tmproot=/tmp/kaldi.XXXX # sets tmproot for 'copy-feats', +seed=777 # seed value used for data-shuffling, nn-initialization, and training, skip_cuda_check=false + # End configuration. echo "$0 $@" # Print the command line for logging [ -f path.sh ] && . ./path.sh; - - . parse_options.sh || exit 1; +set -euo pipefail if [ $# != 6 ]; then echo "Usage: $0 " echo " e.g.: $0 data/train data/cv data/lang exp/mono_ali_train exp/mono_ali_cv exp/mono_nnet" echo "" echo " Training data : , (for optimizing cross-entropy)" - echo " Held-out data : , (for learn-rate/model selection based on cross-entopy)" + echo " Held-out data : , (for learn-rate scheduling, model selection)" echo " note.: , can point to same directory, or 2 separate directories." echo "" echo "main options (for others, see top of script file)" echo " --config # config containing options" echo "" - echo " --apply-cmvn # apply CMN" - echo " --norm-vars # add CVN if CMN already active" - echo " --splice # concatenate input features" - echo " --feat-type # select type of input features" - echo "" - echo " --mlp-proto # use this NN prototype" + echo " --network-type (dnn,cnn1d,cnn2d,lstm) # type of neural network" + echo " --nnet-proto # use this NN prototype" echo " --feature-transform # re-use this input feature transform" - echo " --hid-layers # number of hidden layers" - echo " --hid-dim # width of hidden layers" - echo " --bn-dim # make bottle-neck network with bn-with N" echo "" + echo " --feat-type (plain|traps|transf) # type of input features" + echo " --cmvn-opts # add 'apply-cmvn' to input feature pipeline" + echo " --delta-opts # add 'add-deltas' to input feature pipeline" + echo " --splice # splice +/-N frames of input features" + echo echo " --learn-rate # initial leaning-rate" - echo " --copy-feats # copy input features to /tmp (it's faster)" + echo " --copy-feats # copy features to /tmp, lowers storage stress" echo "" exit 1; fi @@ -100,7 +99,7 @@ dir=$6 # Using alidir for supervision (default) if [ -z "$labels" ]; then - silphonelist=`cat $lang/phones/silence.csl` || exit 1; + silphonelist=`cat $lang/phones/silence.csl` for f in $alidir/final.mdl $alidir/ali.1.gz $alidir_cv/ali.1.gz; do [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done @@ -114,14 +113,18 @@ echo echo "# INFO" echo "$0 : Training Neural Network" printf "\t dir : $dir \n" -printf "\t Train-set : $data $alidir \n" -printf "\t CV-set : $data_cv $alidir_cv \n" +printf "\t Train-set : $data $(cat $data/feats.scp | wc -l), $alidir \n" +printf "\t CV-set : $data_cv $(cat $data_cv/feats.scp | wc -l) $alidir_cv \n" echo mkdir -p $dir/{log,nnet} -# skip when already trained -[ -e $dir/final.nnet ] && printf "\nSKIPPING TRAINING... ($0)\nnnet already trained : $dir/final.nnet ($(readlink $dir/final.nnet))\n\n" && exit 0 +# skip when already trained, +if [ -e $dir/final.nnet ]; then + echo "SKIPPING TRAINING... ($0)" + echo "nnet already trained : $dir/final.nnet ($(readlink $dir/final.nnet))" + exit 0 +fi # check if CUDA compiled in and GPU is available, if ! $skip_cuda_check; then cuda-gpu-available || exit 1; fi @@ -135,76 +138,90 @@ if [ ! -z "$labels" ]; then labels_cv="$labels" else echo "Using PDF targets from dirs '$alidir' '$alidir_cv'" - # define pdf-alignment rspecifiers + # training targets in posterior format, labels_tr="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- | ali-to-post ark:- ark:- |" labels_cv="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir_cv/ali.*.gz |\" ark:- | ali-to-post ark:- ark:- |" - # - labels_tr_pdf="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |" # for analyze-counts. + # training targets for analyze-counts, + labels_tr_pdf="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |" labels_tr_phn="ark:ali-to-phones --per-frame=true $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |" - # get pdf-counts, used later to post-process DNN posteriors - analyze-counts --verbose=1 --binary=false "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log || exit 1 - # copy the old transition model, will be needed by decoder - copy-transition-model --binary=false $alidir/final.mdl $dir/final.mdl || exit 1 + # get pdf-counts, used later for decoding/aligning, + num_pdf=$(hmm-info $alidir/final.mdl | awk '/pdfs/{print $4}') + analyze-counts --verbose=1 --binary=false --counts-dim=$num_pdf \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log + # copy the old transition model, will be needed by decoder, + copy-transition-model --binary=false $alidir/final.mdl $dir/final.mdl # copy the tree - cp $alidir/tree $dir/tree || exit 1 + cp $alidir/tree $dir/tree - # make phone counts for analysis - [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log || exit 1 + # make phone counts for analysis, + [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt --counts-dim=$num_pdf \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log fi ###### PREPARE FEATURES ###### echo echo "# PREPARING FEATURES" -# shuffle the list -echo "Preparing train/cv lists :" -cat $data/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp -cp $data_cv/feats.scp $dir/cv.scp -# print the list sizes -wc -l $dir/train.scp $dir/cv.scp - -# re-save the train/cv features to /tmp, reduces LAN traffic, avoids disk-seeks due to 
shuffled features if [ "$copy_feats" == "true" ]; then - tmpdir=$(mktemp -d $copy_feats_tmproot); mv $dir/train.scp{,_non_local}; mv $dir/cv.scp{,_non_local} - copy-feats scp:$dir/train.scp_non_local ark,scp:$tmpdir/train.ark,$dir/train.scp || exit 1 - copy-feats scp:$dir/cv.scp_non_local ark,scp:$tmpdir/cv.ark,$dir/cv.scp || exit 1 - trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT + echo "# re-saving features to local disk," + tmpdir=$(mktemp -d $copy_feats_tmproot) + copy-feats scp:$data/feats.scp ark,scp:$tmpdir/train.ark,$dir/train_sorted.scp + copy-feats scp:$data_cv/feats.scp ark,scp:$tmpdir/cv.ark,$dir/cv.scp + trap "echo '# Removing features tmpdir $tmpdir @ $(hostname)'; ls $tmpdir; rm -r $tmpdir" EXIT +else + # or copy the list, + cp $data/feats.scp $dir/train_sorted.scp + cp $data_cv/feats.scp $dir/cv.scp fi +# shuffle the list, +utils/shuffle_list.pl --srand ${seed:-777} <$dir/train_sorted.scp >$dir/train.scp -#create a 10k utt subset for global cmvn estimates +# create a 10k utt subset for global cmvn estimates, head -n 10000 $dir/train.scp > $dir/train.scp.10k +# for debugging, add lists with non-local features, +utils/shuffle_list.pl --srand ${seed:-777} <$data/feats.scp >$dir/train.scp_non_local +cp $data_cv/feats.scp $dir/cv.scp_non_local -###### PREPARE FEATURE PIPELINE ###### - -# optionally import feature setup from pre-training, +###### OPTIONALLY IMPORT FEATURE SETTINGS (from pre-training) ###### +ivector_dim= # no ivectors, if [ ! -z $feature_transform ]; then D=$(dirname $feature_transform) + echo "# importing feature settings from dir '$D'" [ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility, [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) [ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility, [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) - echo "Imported config : cmvn_opts='$cmvn_opts' delta_opts='$delta_opts'" + [ -e $D/ivector_dim ] && ivector_dim=$(cat $D/ivector_dim) + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + echo "# cmvn_opts='$cmvn_opts' delta_opts='$delta_opts' ivector_dim='$ivector_dim'" fi +###### PREPARE FEATURE PIPELINE ###### # read the features, feats_tr="ark:copy-feats scp:$dir/train.scp ark:- |" feats_cv="ark:copy-feats scp:$dir/cv.scp ark:- |" + # optionally add per-speaker CMVN, if [ ! -z "$cmvn_opts" ]; then - echo "Will use CMVN statistics : $data/cmvn.scp, $data_cv/cmvn.scp" + echo "# + 'apply-cmvn' with '$cmvn_opts' using statistics : $data/cmvn.scp, $data_cv/cmvn.scp" [ ! -r $data/cmvn.scp ] && echo "Missing $data/cmvn.scp" && exit 1; [ ! -r $data_cv/cmvn.scp ] && echo "Missing $data_cv/cmvn.scp" && exit 1; feats_tr="$feats_tr apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |" feats_cv="$feats_cv apply-cmvn $cmvn_opts --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp ark:- ark:- |" else - echo "apply-cmvn is not used" + echo "# 'apply-cmvn' is not used," fi + # optionally add deltas, if [ ! -z "$delta_opts" ]; then feats_tr="$feats_tr add-deltas $delta_opts ark:- ark:- |" feats_cv="$feats_cv add-deltas $delta_opts ark:- ark:- |" - echo "add-deltas with $delta_opts" + echo "# + 'add-deltas' with '$delta_opts'" fi # keep track of the config, @@ -219,33 +236,40 @@ if [ ! 
-z "$pytel_transform" ]; then pytel_transform=$dir/pytel_transform.py feats_tr="$feats_tr /bin/env python $pytel_transform |" feats_cv="$feats_cv /bin/env python $pytel_transform |" + echo "# + 'pytel-transform' from '$pytel_transform'" fi -# get feature dim -echo "Getting feature dim : " -feat_dim=$(feat-to-dim --print-args=false "$feats_tr" -) -echo "Feature dim is : $feat_dim" +# get feature dim, +feat_dim=$(feat-to-dim "$feats_tr" -) +echo "# feature dim : $feat_dim (input of 'feature_transform')" -# Now we will start building complex feature_transform which will -# be forwarded in CUDA to have fast run-time. +# Now we start building 'feature_transform' which goes right in front of a NN. +# The forwarding is computed on a GPU before the frame shuffling is applied. # -# We will use 1GPU for both feature_transform and MLP training in one binary tool. -# This is against the kaldi spirit to have many independent small processing units, -# but it is necessary because of compute exclusive mode, where GPU cannot be shared -# by multiple processes. +# Same GPU is used both for 'feature_transform' and the NN training. +# So it has to be done by a single process (we are using exclusive mode). +# This also reduces the CPU-GPU uploads/downloads to minimum. if [ ! -z "$feature_transform" ]; then - echo "Using pre-computed feature-transform : '$feature_transform'" - tmp=$dir/$(basename $feature_transform) + echo "# importing 'feature_transform' from '$feature_transform'" + tmp=$dir/imported_$(basename $feature_transform) cp $feature_transform $tmp; feature_transform=$tmp else - # Generate the splice transform - echo "Using splice +/- $splice , step $splice_step" - feature_transform=$dir/tr_splice$splice-$splice_step.nnet - utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice --splice-step=$splice_step > $feature_transform + # Make default proto with splice, + if [ ! 
-z $feature_transform_proto ]; then + echo "# importing custom 'feature_transform_proto' from '$feature_transform_proto'" + else + echo "# + default 'feature_transform_proto' with splice +/-$splice frames," + feature_transform_proto=$dir/splice${splice}.proto + echo " $feat_dim $(((2*splice+1)*feat_dim)) -$splice:$splice " >$feature_transform_proto + fi + + # Initialize 'feature-transform' from a prototype, + feature_transform=$dir/tr_$(basename $feature_transform_proto .proto).nnet + nnet-initialize --binary=false $feature_transform_proto $feature_transform # Choose further processing of spliced features - echo "Feature type : $feat_type" + echo "# feature type : $feat_type" case $feat_type in plain) ;; @@ -253,14 +277,14 @@ else #generate hamming+dct transform feature_transform_old=$feature_transform feature_transform=${feature_transform%.nnet}_hamm_dct${traps_dct_basis}.nnet - echo "Preparing Hamming DCT transform into : $feature_transform" + echo "# + Hamming DCT transform (t$((splice*2+1)),dct${traps_dct_basis}) into '$feature_transform'" #prepare matrices with time-transposed hamming and dct utils/nnet/gen_hamm_mat.py --fea-dim=$feat_dim --splice=$splice > $dir/hamm.mat utils/nnet/gen_dct_mat.py --fea-dim=$feat_dim --splice=$splice --dct-basis=$traps_dct_basis > $dir/dct.mat #put everything together compose-transforms --binary=false $dir/dct.mat $dir/hamm.mat - | \ transf-to-nnet - - | \ - nnet-concat --binary=false $feature_transform_old - $feature_transform || exit 1 + nnet-concat --binary=false $feature_transform_old - $feature_transform ;; transf) feature_transform_old=$feature_transform @@ -271,131 +295,153 @@ else nnet-concat --binary=false $feature_transform_old \ "transf-to-nnet $transf - |" \ "utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice_after_transf |" \ - $feature_transform || exit 1 - ;; - lda) - transf=$dir/lda$lda_dim.mat - #get the LDA statistics - if [ ! 
-r "$dir/lda.acc" ]; then - echo "LDA: Converting alignments to posteriors $dir/lda_post.scp" - ali-to-post "ark:gunzip -c $alidir/ali.*.gz|" ark:- | \ - weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark,scp:$dir/lda_post.ark,$dir/lda_post.scp 2>$dir/log/ali-to-post-lda.log || exit 1; - echo "Accumulating LDA statistics $dir/lda.acc on top of spliced feats" - acc-lda --rand-prune=4.0 $alidir/final.mdl "$feats_tr nnet-forward $feature_transform ark:- ark:- |" scp:$dir/lda_post.scp $dir/lda.acc 2>$dir/log/acc-lda.log || exit 1; - else - echo "LDA: Using pre-computed stats $dir/lda.acc" - fi - #estimate the transform - echo "Estimating LDA transform $dir/lda.mat from the statistics $dir/lda.acc" - est-lda --write-full-matrix=$dir/lda.full.mat --dim=$lda_dim $transf $dir/lda.acc 2>$dir/log/lda.log || exit 1; - #append the LDA matrix to feature_transform - feature_transform_old=$feature_transform - feature_transform=${feature_transform%.nnet}_lda${lda_dim}.nnet - transf-to-nnet $transf - | \ - nnet-concat --binary=false $feature_transform_old - $feature_transform || exit 1 - #remove the temporary file - rm $dir/lda_post.{ark,scp} + $feature_transform ;; *) echo "Unknown feature type $feat_type" exit 1; ;; esac - # keep track of feat_type + + # keep track of feat_type, echo $feat_type > $dir/feat_type - # Renormalize the MLP input to zero mean and unit variance + # Renormalize the MLP input to zero mean and unit variance, feature_transform_old=$feature_transform feature_transform=${feature_transform%.nnet}_cmvn-g.nnet - echo "Renormalizing MLP input features into $feature_transform" - nnet-forward --use-gpu=yes \ - $feature_transform_old "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" \ - ark:- 2>$dir/log/nnet-forward-cmvn.log |\ - compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\ - nnet-concat --binary=false $feature_transform_old - $feature_transform - [ ! -f $feature_transform ] && cat $dir/log/nnet-forward-cmvn.log && echo "Error: Global CMVN failed, was the CUDA GPU okay?" && echo && exit 1 + echo "# compute normalization stats from 10k sentences" + nnet-forward --print-args=true --use-gpu=yes $feature_transform_old \ + "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" ark:- |\ + compute-cmvn-stats ark:- $dir/cmvn-g.stats + echo "# + normalization of NN-input at '$feature_transform'" + nnet-concat --binary=false $feature_transform_old \ + "cmvn-to-nnet $dir/cmvn-g.stats -|" $feature_transform fi +if [ ! -z $ivector ]; then + echo + echo "# ADDING IVECTOR FEATURES" + # The iVectors are concatenated 'as they are' directly to the input of the neural network, + # To do this, we paste the features, and use where the 1st component + # contains the transform and 2nd network contains component. + + echo "# getting dims," + dim_raw=$(feat-to-dim "$feats_tr" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + echo "# dims, feats-raw $dim_raw, ivectors $dim_ivec," + + # Should we do something with 'feature_transform'? + if [ ! -z $ivector_dim ]; then + # No, the 'ivector_dim' comes from dir with 'feature_transform' with iVec forwarding, + echo "# assuming we got '$feature_transform' with ivector forwarding," + [ $ivector_dim != $dim_ivec ] && \ + echo -n "Error, i-vector dimensionality mismatch!" 
&& \ + echo " (expected $ivector_dim, got $dim_ivec in $ivector)" && exit 1 + else + # Yes, adjust the transform to do ``iVec forwarding'', + feature_transform_old=$feature_transform + feature_transform=${feature_transform%.nnet}_ivec_copy.nnet + echo "# setting up ivector forwarding into '$feature_transform'," + dim_transformed=$(feat-to-dim "$feats_tr nnet-forward $feature_transform_old ark:- ark:- |" -) + nnet-initialize --print-args=false <(echo " $dim_ivec $dim_ivec 1:$dim_ivec ") $dir/tr_ivec_copy.nnet + nnet-initialize --print-args=false <(echo " $((dim_raw+dim_ivec)) $((dim_transformed+dim_ivec)) \ + $feature_transform_old $dir/tr_ivec_copy.nnet ") $feature_transform + fi + echo $dim_ivec >$dir/ivector_dim # mark down the iVec dim! + echo $ivector_append_tool >$dir/ivector_append_tool + + # pasting the iVecs to the feaures, + echo "# + ivector input '$ivector'" + feats_tr="$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" + feats_cv="$feats_cv $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +###### Show the final 'feature_transform' in the log, +echo +echo "### Showing the final 'feature_transform':" +nnet-info $feature_transform +echo "###" ###### MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ###### -(cd $dir; [ ! -f final.feature_transform ] && ln -s $(basename $feature_transform) final.feature_transform ) +[ -f $dir/final.feature_transform ] && unlink $dir/final.feature_transform +(cd $dir; ln -s $(basename $feature_transform) final.feature_transform ) +feature_transform=$dir/final.feature_transform ###### INITIALIZE THE NNET ###### echo echo "# NN-INITIALIZATION" -[ ! -z "$nnet_init" ] && echo "Using pre-initialized network '$nnet_init'"; -if [ ! -z "$nnet_proto" ]; then - echo "Initializing using network prototype '$nnet_proto'"; +if [ ! -z $nnet_init ]; then + echo "# using pre-initialized network '$nnet_init'" +elif [ ! -z $nnet_proto ]; then + echo "# initializing NN from prototype '$nnet_proto'"; nnet_init=$dir/nnet.init; log=$dir/log/nnet_initialize.log - nnet-initialize $nnet_proto $nnet_init 2>$log || { cat $log; exit 1; } -fi -if [[ -z "$nnet_init" && -z "$nnet_proto" ]]; then - echo "Getting input/output dims :" - #initializing the MLP, get the i/o dims... - #input-dim - num_fea=$(feat-to-dim "$feats_tr nnet-forward $feature_transform ark:- ark:- |" - ) - { #optioanlly take output dim of DBN - [ ! -z $dbn ] && num_fea=$(nnet-forward "nnet-concat $feature_transform $dbn -|" "$feats_tr" ark:- | feat-to-dim ark:- -) - [ -z "$num_fea" ] && echo "Getting nnet input dimension failed!!" && exit 1 - } - - #output-dim - [ -z $num_tgt ] && num_tgt=$(hmm-info --print-args=false $alidir/final.mdl | grep pdfs | awk '{ print $NF }') - - # make network prototype + nnet-initialize --seed=$seed $nnet_proto $nnet_init +else + echo "# getting input/output dims :" + # input-dim, + get_dim_from=$feature_transform + [ ! 
-z "$dbn" ] && get_dim_from="nnet-concat $feature_transform '$dbn' -|" + num_fea=$(feat-to-dim "$feats_tr nnet-forward \"$get_dim_from\" ark:- ark:- |" -) + + # output-dim, + [ -z $num_tgt ] && \ + num_tgt=$(hmm-info --print-args=false $alidir/final.mdl | grep pdfs | awk '{ print $NF }') + + # make network prototype, nnet_proto=$dir/nnet.proto - echo "Genrating network prototype $nnet_proto" + echo "# genrating network prototype $nnet_proto" case "$network_type" in dnn) utils/nnet/make_nnet_proto.py $proto_opts \ ${bn_dim:+ --bottleneck-dim=$bn_dim} \ - $num_fea $num_tgt $hid_layers $hid_dim >$nnet_proto || exit 1 + $num_fea $num_tgt $hid_layers $hid_dim >$nnet_proto ;; cnn1d) delta_order=$([ -z $delta_opts ] && echo "0" || { echo $delta_opts | tr ' ' '\n' | grep "delta[-_]order" | sed 's:^.*=::'; }) echo "Debug : $delta_opts, delta_order $delta_order" utils/nnet/make_cnn_proto.py $cnn_proto_opts \ --splice=$splice --delta-order=$delta_order --dir=$dir \ - $num_fea >$nnet_proto || exit 1 + $num_fea >$nnet_proto cnn_fea=$(cat $nnet_proto | grep -v '^$' | tail -n1 | awk '{ print $5; }') utils/nnet/make_nnet_proto.py $proto_opts \ --no-proto-head --no-smaller-input-weights \ ${bn_dim:+ --bottleneck-dim=$bn_dim} \ - "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto || exit 1 + "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto ;; cnn2d) delta_order=$([ -z $delta_opts ] && echo "0" || { echo $delta_opts | tr ' ' '\n' | grep "delta[-_]order" | sed 's:^.*=::'; }) echo "Debug : $delta_opts, delta_order $delta_order" utils/nnet/make_cnn2d_proto.py $cnn_proto_opts \ --splice=$splice --delta-order=$delta_order --dir=$dir \ - $num_fea >$nnet_proto || exit 1 + $num_fea >$nnet_proto cnn_fea=$(cat $nnet_proto | grep -v '^$' | tail -n1 | awk '{ print $5; }') utils/nnet/make_nnet_proto.py $proto_opts \ --no-proto-head --no-smaller-input-weights \ ${bn_dim:+ --bottleneck-dim=$bn_dim} \ - "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto || exit 1 + "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto ;; lstm) utils/nnet/make_lstm_proto.py $proto_opts \ - $num_fea $num_tgt >$nnet_proto || exit 1 + $num_fea $num_tgt >$nnet_proto ;; blstm) utils/nnet/make_blstm_proto.py $proto_opts \ - $num_fea $num_tgt >$nnet_proto || exit 1 + $num_fea $num_tgt >$nnet_proto ;; - *) echo "Unknown : --network_type $network_type" && exit 1; + *) echo "Unknown : --network-type $network_type" && exit 1; esac - # initialize - nnet_init=$dir/nnet.init; log=$dir/log/nnet_initialize.log - echo "Initializing $nnet_proto -> $nnet_init" - nnet-initialize $nnet_proto $nnet_init 2>$log || { cat $log; exit 1; } + # initialize, + nnet_init=$dir/nnet.init + echo "# initializing the NN '$nnet_proto' -> '$nnet_init'" + nnet-initialize --seed=$seed $nnet_proto $nnet_init - # optionally prepend dbn to the initialization - if [ ! -z $dbn ]; then - nnet_init_old=$nnet_init; nnet_init=$dir/nnet_$(basename $dbn)_dnn.init - nnet-concat $dbn $nnet_init_old $nnet_init || exit 1 + # optionally prepend dbn to the initialization, + if [ ! 
-z "$dbn" ]; then + nnet_init_old=$nnet_init; nnet_init=$dir/nnet_dbn_dnn.init + nnet-concat "$dbn" $nnet_init_old $nnet_init fi fi @@ -404,22 +450,17 @@ fi echo echo "# RUNNING THE NN-TRAINING SCHEDULER" steps/nnet/train_scheduler.sh \ + ${scheduler_opts} \ + ${train_tool:+ --train-tool "$train_tool"} \ + ${train_tool_opts:+ --train-tool-opts "$train_tool_opts"} \ ${feature_transform:+ --feature-transform $feature_transform} \ --learn-rate $learn_rate \ - --randomizer-seed $seed \ - ${train_opts} \ - ${train_tool:+ --train-tool "$train_tool"} \ ${frame_weights:+ --frame-weights "$frame_weights"} \ + ${utt_weights:+ --utt-weights "$utt_weights"} \ ${config:+ --config $config} \ - $nnet_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir || exit 1 - -if $prepend_cnn; then - echo "Preparing feature transform with CNN layers for RBM pre-training." - nnet-concat $dir/final.feature_transform "nnet-copy --remove-last-layers=$(((hid_layers+1)*2)) $dir/final.nnet - |" \ - $dir/final.feature_transform_cnn 2>$dir/log/concat_transf_cnn.log || exit 1 -fi + $nnet_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir -echo "$0 successfuly finished.. $dir" +echo "$0: Successfuly finished. '$dir'" sleep 3 exit 0 diff --git a/egs/wsj/s5/steps/nnet/train_mmi.sh b/egs/wsj/s5/steps/nnet/train_mmi.sh index 6e1b42653c7..e2bbfbc6e92 100755 --- a/egs/wsj/s5/steps/nnet/train_mmi.sh +++ b/egs/wsj/s5/steps/nnet/train_mmi.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2013 Brno University of Technology (Author: Karel Vesely) +# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0. # Sequence-discriminative MMI/BMMI training of DNN. @@ -21,6 +21,7 @@ learn_rate=0.00001 halving_factor=1.0 #ie. disable halving drop_frames=true verbose=1 +ivector= seed=777 # seed value used for training data shuffling skip_cuda_check=false @@ -31,9 +32,11 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# -ne 6 ]; then - echo "Usage: steps/$0 " - echo " e.g.: steps/$0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_mmi" + echo "Usage: $0 " + echo " e.g.: $0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_mmi" echo "Main options (for others, see top of script file)" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --config # config containing options" @@ -54,7 +57,9 @@ alidir=$4 denlatdir=$5 dir=$6 -for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.scp $srcdir/{final.nnet,final.feature_transform}; do +for f in $data/feats.scp $denlatdir/lat.scp \ + $alidir/{tree,final.mdl,ali.1.gz} \ + $srcdir/{final.nnet,final.feature_transform}; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done @@ -65,7 +70,7 @@ mkdir -p $dir/log cp $alidir/{final.mdl,tree} $dir -silphonelist=`cat $lang/phones/silence.csl` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` #Get the files we will need @@ -91,7 +96,7 @@ model=$dir/final.mdl # Shuffle the feature list to make the GD stochastic! # By shuffling features, we have to use lattices with random access (indexed by .scp file). -cat $data/feats.scp | utils/shuffle_list.pl --srand $seed > $dir/train.scp +cat $data/feats.scp | utils/shuffle_list.pl --srand $seed >$dir/train.scp ### ### PREPARE FEATURE EXTRACTION PIPELINE @@ -112,15 +117,34 @@ feats="ark,o:copy-feats scp:$dir/train.scp ark:- |" [ ! 
-z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# -# Record the setup, +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + dim_raw=$(feat-to-dim "$feats" -) + dim_raw_and_ivec=$(feat-to-dim "$feats $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +### Record the setup, [ ! -z "$cmvn_opts" ] && echo $cmvn_opts >$dir/cmvn_opts [ ! -z "$delta_opts" ] && echo $delta_opts >$dir/delta_opts -### -### +[ -e $D/pytel_transform.py ] && cp $D/pytel_transform.py $dir/pytel_transform.py +[ -e $D/ivector_dim ] && cp $D/ivector_dim $dir/ivector_dim +[ -e $D/ivector_append_tool ] && cp $D/ivector_append_tool $dir/ivector_append_tool ### - ### ### Prepare the alignments ### @@ -173,7 +197,7 @@ while [ $x -le $num_iters ]; do --learn-rate=$learn_rate \ --drop-frames=$drop_frames \ --verbose=$verbose \ - $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1 + $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet fi cur_mdl=$dir/$x.nnet @@ -189,9 +213,15 @@ done echo "MMI/BMMI training finished" -echo "Re-estimating priors by forwarding the training set." -. cmd.sh -nj=$(cat $alidir/num_jobs) -steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj $data $dir || exit 1 +if [ -e $dir/prior_counts ]; then + echo "Priors are already re-estimated, skipping... ($dir/prior_counts)" +else + echo "Re-estimating priors by forwarding 10k utterances from training set." + . cmd.sh + nj=$(cat $alidir/num_jobs) + steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj \ + ${ivector:+ --ivector "$ivector"} $data $dir +fi +echo "$0: Done. '$dir'" exit 0 diff --git a/egs/wsj/s5/steps/nnet/train_mpe.sh b/egs/wsj/s5/steps/nnet/train_mpe.sh index 6dd77d59edd..6eb107ef04f 100755 --- a/egs/wsj/s5/steps/nnet/train_mpe.sh +++ b/egs/wsj/s5/steps/nnet/train_mpe.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2013-2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0. # Sequence-discriminative MPE/sMBR training of DNN. @@ -17,12 +17,15 @@ num_iters=4 acwt=0.1 lmwt=1.0 learn_rate=0.00001 +momentum=0.0 halving_factor=1.0 #ie. disable halving do_smbr=true exclude_silphones=true # exclude silphones from approximate accuracy computation unkphonelist= # exclude unkphones from approximate accuracy computation (overrides exclude_silphones) one_silence_class=true # true : reduce insertions in sMBR/MPE FW/BW, more stable training, + # (all silphones are seen as a single class in the sMBR/MPE FW/BW) verbose=1 +ivector= seed=777 # seed value used for training data shuffling skip_cuda_check=false @@ -33,9 +36,11 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . 
./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# -ne 6 ]; then - echo "Usage: steps/$0 " - echo " e.g.: steps/$0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_smbr" + echo "Usage: $0 " + echo " e.g.: $0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_smbr" echo "Main options (for others, see top of script file)" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --config # config containing options" @@ -55,7 +60,9 @@ alidir=$4 denlatdir=$5 dir=$6 -for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.scp $srcdir/{final.nnet,final.feature_transform}; do +for f in $data/feats.scp $denlatdir/lat.scp \ + $alidir/{tree,final.mdl,ali.1.gz} \ + $srcdir/{final.nnet,final.feature_transform}; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done @@ -66,7 +73,7 @@ mkdir -p $dir/log cp $alidir/{final.mdl,tree} $dir -silphonelist=`cat $lang/phones/silence.csl` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` #Get the files we will need nnet=$srcdir/$(readlink $srcdir/final.nnet || echo final.nnet); @@ -87,7 +94,9 @@ cp $feature_transform $dir/final.feature_transform model=$dir/final.mdl [ -z "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1; -#enable/disable silphones from MPE training +# The argument '--silence-phones=csl' together with '--one-silence-class=true' +# will cause regrouping of the silenece phones into a single class in the FW/BW +# which calculates the Loss derivative (the 'new' behavior). mpe_silphones_arg= #empty $exclude_silphones && mpe_silphones_arg="--silence-phones=$silphonelist" # all silphones [ ! -z $unkphonelist ] && mpe_silphones_arg="--silence-phones=$unkphonelist" # unk only @@ -116,15 +125,34 @@ feats="ark,o:copy-feats scp:$dir/train.scp ark:- |" [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# -# Record the setup, +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + dim_raw=$(feat-to-dim "$feats" -) + dim_raw_and_ivec=$(feat-to-dim "$feats $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +### Record the setup, [ ! -z "$cmvn_opts" ] && echo $cmvn_opts >$dir/cmvn_opts [ ! 
-z "$delta_opts" ] && echo $delta_opts >$dir/delta_opts -### -### +[ -e $D/pytel_transform.py ] && cp {$D,$dir}/pytel_transform.py +[ -e $D/ivector_dim ] && cp {$D,$dir}/ivector_dim +[ -e $D/ivector_append_tool ] && cp $D/ivector_append_tool $dir/ivector_append_tool ### - ### ### Prepare the alignments ### @@ -155,11 +183,12 @@ while [ $x -le $num_iters ]; do --acoustic-scale=$acwt \ --lm-scale=$lmwt \ --learn-rate=$learn_rate \ + --momentum=$momentum \ --do-smbr=$do_smbr \ --verbose=$verbose \ --one-silence-class=$one_silence_class \ $mpe_silphones_arg \ - $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1 + $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet fi cur_mdl=$dir/$x.nnet @@ -176,9 +205,15 @@ done echo "MPE/sMBR training finished" -echo "Re-estimating priors by forwarding the training set." -. cmd.sh -nj=$(cat $alidir/num_jobs) -steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj $data $dir || exit 1 +if [ -e $dir/prior_counts ]; then + echo "Priors are already re-estimated, skipping... ($dir/prior_counts)" +else + echo "Re-estimating priors by forwarding 10k utterances from training set." + . cmd.sh + nj=$(cat $alidir/num_jobs) + steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj \ + ${ivector:+ --ivector "$ivector"} $data $dir +fi +echo "$0: Done. '$dir'" exit 0 diff --git a/egs/wsj/s5/steps/nnet/train_scheduler.sh b/egs/wsj/s5/steps/nnet/train_scheduler.sh index 4569203e123..59901f5d1d2 100755 --- a/egs/wsj/s5/steps/nnet/train_scheduler.sh +++ b/egs/wsj/s5/steps/nnet/train_scheduler.sh @@ -1,36 +1,35 @@ #!/bin/bash -# Copyright 2012 Karel Vesely (Brno University of Technology) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 -# Train neural network +# Schedules epochs and controls learning rate during the neural network training # Begin configuration. -# training options +# training options, learn_rate=0.008 momentum=0 l1_penalty=0 l2_penalty=0 -# data processing -minibatch_size=256 -randomizer_size=32768 -randomizer_seed=777 + +# data processing, +train_tool="nnet-train-frmshuff" +train_tool_opts="--minibatch-size=256 --randomizer-size=32768 --randomizer-seed=777" feature_transform= -# learn rate scheduling + +# learn rate scheduling, max_iters=20 min_iters=0 # keep training, disable weight rejection, start learn-rate halving as usual, -keep_lr_iters=0 # fix learning rate for N initial epochs, -#start_halving_inc=0.5 -#end_halving_inc=0.1 +keep_lr_iters=0 # fix learning rate for N initial epochs, disable weight rejection, start_halving_impr=0.01 end_halving_impr=0.001 halving_factor=0.5 -# misc. + +# misc, verbose=1 -# tool -train_tool="nnet-train-frmshuff" frame_weights= +utt_weights= # End configuration. @@ -39,6 +38,8 @@ echo "$0 $@" # Print the command line for logging . 
parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 6 ]; then echo "Usage: $0 " echo " e.g.: $0 0.nnet scp:train.scp scp:cv.scp ark:labels_tr.ark ark:labels_cv.ark exp/dnn1" @@ -62,69 +63,71 @@ dir=$6 [ -e $dir/final.nnet ] && echo "'$dir/final.nnet' exists, skipping training" && exit 0 ############################## -#start training +# start training -# choose mlp to start with +# choose mlp to start with, mlp_best=$mlp_init mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*} -# optionally resume training from the best epoch + +# optionally resume training from the best epoch, using saved learning-rate, [ -e $dir/.mlp_best ] && mlp_best=$(cat $dir/.mlp_best) [ -e $dir/.learn_rate ] && learn_rate=$(cat $dir/.learn_rate) -# cross-validation on original network +# cross-validation on original network, log=$dir/log/iter00.initial.log; hostname>$log -$train_tool --cross-validate=true \ - --minibatch-size=$minibatch_size --randomizer-size=$randomizer_size --randomize=false --verbose=$verbose \ - ${feature_transform:+ --feature-transform=$feature_transform} \ - ${frame_weights:+ "--frame-weights=$frame_weights"} \ - "$feats_cv" "$labels_cv" $mlp_best \ - 2>> $log || exit 1; +$train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$feats_cv" "$labels_cv" $mlp_best \ + 2>> $log loss=$(cat $dir/log/iter00.initial.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') loss_type=$(cat $dir/log/iter00.initial.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $5; }') echo "CROSSVAL PRERUN AVG.LOSS $(printf "%.4f" $loss) $loss_type" -# resume lr-halving +# resume lr-halving, halving=0 [ -e $dir/.halving ] && halving=$(cat $dir/.halving) -# training + +# training, for iter in $(seq -w $max_iters); do echo -n "ITERATION $iter: " mlp_next=$dir/nnet/${mlp_base}_iter${iter} - # skip iteration if already done + # skip iteration (epoch) if already done, [ -e $dir/.done_iter$iter ] && echo -n "skipping... 
" && ls $mlp_next* && continue - # training + # training, log=$dir/log/iter${iter}.tr.log; hostname>$log - $train_tool \ - --learn-rate=$learn_rate --momentum=$momentum --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \ - --minibatch-size=$minibatch_size --randomizer-size=$randomizer_size --randomize=true --verbose=$verbose \ - --binary=true \ - ${feature_transform:+ --feature-transform=$feature_transform} \ - ${frame_weights:+ "--frame-weights=$frame_weights"} \ - ${randomizer_seed:+ --randomizer-seed=$randomizer_seed} \ - "$feats_tr" "$labels_tr" $mlp_best $mlp_next \ - 2>> $log || exit 1; + $train_tool --cross-validate=false --randomize=true --verbose=$verbose $train_tool_opts \ + --learn-rate=$learn_rate --momentum=$momentum \ + --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$feats_tr" "$labels_tr" $mlp_best $mlp_next \ + 2>> $log || exit 1; tr_loss=$(cat $dir/log/iter${iter}.tr.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "TRAIN AVG.LOSS $(printf "%.4f" $tr_loss), (lrate$(printf "%.6g" $learn_rate)), " - # cross-validation + # cross-validation, log=$dir/log/iter${iter}.cv.log; hostname>$log - $train_tool --cross-validate=true \ - --minibatch-size=$minibatch_size --randomizer-size=$randomizer_size --randomize=false --verbose=$verbose \ - ${feature_transform:+ --feature-transform=$feature_transform} \ - ${frame_weights:+ "--frame-weights=$frame_weights"} \ - "$feats_cv" "$labels_cv" $mlp_next \ - 2>>$log || exit 1; + $train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$feats_cv" "$labels_cv" $mlp_next \ + 2>>$log || exit 1; loss_new=$(cat $dir/log/iter${iter}.cv.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "CROSSVAL AVG.LOSS $(printf "%.4f" $loss_new), " - # accept or reject new parameters (based on objective function) + # accept or reject? 
loss_prev=$loss if [ 1 == $(bc <<< "$loss_new < $loss") -o $iter -le $keep_lr_iters -o $iter -le $min_iters ]; then + # accepting: the loss was better, or we had fixed learn-rate, or we had fixed epoch-number, loss=$loss_new mlp_best=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new) [ $iter -le $min_iters ] && mlp_best=${mlp_best}_min-iters-$min_iters @@ -133,18 +136,19 @@ for iter in $(seq -w $max_iters); do echo "nnet accepted ($(basename $mlp_best))" echo $mlp_best > $dir/.mlp_best else + # rejecting, mlp_reject=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)_rejected mv $mlp_next $mlp_reject echo "nnet rejected ($(basename $mlp_reject))" fi - # create .done file as a mark that iteration is over + # create .done file, the iteration (epoch) is completed, touch $dir/.done_iter$iter - # no learn-rate halving yet, if keep_lr_iters set accordingly + # continue with original learn-rate, [ $iter -le $keep_lr_iters ] && continue - # stopping criterion + # stopping criterion, rel_impr=$(bc <<< "scale=10; ($loss_prev-$loss)/$loss_prev") if [ 1 == $halving -a 1 == $(bc <<< "$rel_impr < $end_halving_impr") ]; then if [ $iter -le $min_iters ]; then @@ -155,30 +159,27 @@ for iter in $(seq -w $max_iters); do break fi - # start annealing when improvement is low + # start learning-rate fade-out when improvement is low, if [ 1 == $(bc <<< "$rel_impr < $start_halving_impr") ]; then halving=1 echo $halving >$dir/.halving fi - # do annealing + # reduce the learning-rate, if [ 1 == $halving ]; then learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") echo $learn_rate >$dir/.learn_rate fi done -# select the best network +# select the best network, if [ $mlp_best != $mlp_init ]; then mlp_final=${mlp_best}_final_ ( cd $dir/nnet; ln -s $(basename $mlp_best) $(basename $mlp_final); ) ( cd $dir; ln -s nnet/$(basename $mlp_final) final.nnet; ) - echo "Succeeded training the Neural Network : $dir/final.nnet" + echo "$0: Succeeded training the Neural Network : '$dir/final.nnet'" else - "Error training neural network..." + echo "$0: Error training neural network..." exit 1 fi - - - diff --git a/egs/wsj/s5/steps/nnet2/adjust_priors.sh b/egs/wsj/s5/steps/nnet2/adjust_priors.sh new file mode 100755 index 00000000000..3cdcfb4ae73 --- /dev/null +++ b/egs/wsj/s5/steps/nnet2/adjust_priors.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# Copyright (c) 2015, Johns Hopkins University (Yenda Trmal ) +# License: Apache 2.0 + +# Begin configuration section. +cmd=run.pl +iter=final +# End configuration section + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 exp/tri4_mpe_degs exp/tri4_mpe" + echo "" + echo "Performs priors adjustment either on the final iteration" + echo "or iteration of choice of the training. The adjusted model" + echo "filename will be suffixed by \"adj\", i.e. for the final" + echo "iteration final.mdl will become final.adj.mdl" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --iter # which iteration to be adjusted" + exit 1; +fi + +degs_dir=$1 +dir=$2 + +src_model=$dir/${iter}.mdl + +if [ ! 
-f $src_model ]; then + echo "$0: Expecting $src_model to exist." + exit 1 +fi + +if [ ! -f $degs_dir/priors_egs.1.ark ]; then + echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist." + exit 1 +fi + +num_archives_priors=`cat $degs_dir/info/num_archives_priors` || { + echo "Could not find $degs_dir/info/num_archives_priors."; + exit 1; +} + +$cmd JOB=1:$num_archives_priors $dir/log/get_post.${iter}.JOB.log \ + nnet-compute-from-egs "nnet-to-raw-nnet $src_model -|" \ + ark:$degs_dir/priors_egs.JOB.ark ark:- \| \ + matrix-sum-rows ark:- ark:- \| \ + vector-sum ark:- $dir/post.${iter}.JOB.vec || { + echo "Error in getting posteriors for adjusting priors." + echo "See $dir/log/get_post.${iter}.*.log"; + exit 1; + } + + +$cmd $dir/log/sum_post.${iter}.log \ + vector-sum $dir/post.${iter}.*.vec $dir/post.${iter}.vec || { + echo "Error in summing posteriors. See $dir/log/sum_post.${iter}.log"; + exit 1; + } + +rm -f $dir/post.${iter}.*.vec + +echo "Re-adjusting priors based on computed posteriors for iter $iter" +$cmd $dir/log/adjust_priors.${iter}.log \ + nnet-adjust-priors $src_model $dir/post.${iter}.vec $dir/${iter}.adj.mdl || { + echo "Error in adjusting priors. See $dir/log/adjust_priors.${iter}.log"; + exit 1; + } + +echo "Done adjusting priors (on $src_model)" diff --git a/egs/wsj/s5/steps/nnet2/decode.sh b/egs/wsj/s5/steps/nnet2/decode.sh index 753411f4563..7f1c8c2673e 100755 --- a/egs/wsj/s5/steps/nnet2/decode.sh +++ b/egs/wsj/s5/steps/nnet2/decode.sh @@ -68,7 +68,7 @@ done sdata=$data/split$nj; cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -99,7 +99,7 @@ if [ ! -z "$transform_dir" ]; then [ ! -s $transform_dir/num_jobs ] && \ echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; nj_orig=$(cat $transform_dir/num_jobs) - + if [ $feat_type == "raw" ]; then trans=raw_trans; else trans=trans; fi if [ $feat_type == "lda" ] && \ @@ -142,7 +142,7 @@ if [ $stage -le 1 ]; then $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; fi -# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# The output of this script is the files "lat.*.gz"-- we'll rescore this at # different acoustic scales to get the final output. @@ -151,7 +151,8 @@ if [ $stage -le 2 ]; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; echo "score best paths" - local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $iter_opt $scoring_opts --cmd "$cmd" $data $graphdir $dir echo "score confidence and timing with sclite" fi fi diff --git a/egs/wsj/s5/steps/nnet2/dump_bottleneck_features.sh b/egs/wsj/s5/steps/nnet2/dump_bottleneck_features.sh index 785a0bf8139..0746a3188a1 100755 --- a/egs/wsj/s5/steps/nnet2/dump_bottleneck_features.sh +++ b/egs/wsj/s5/steps/nnet2/dump_bottleneck_features.sh @@ -36,18 +36,23 @@ nnetdir=$3 archivedir=$4 dir=$5 -# because we [cat trans.*], no need to keep nj consistent with [# of trans] -nj=`cat $transform_dir/num_jobs` || exit 1; - -# Assume that final.mat and final.nnet are at nnetdir -nnet_lda=$nnetdir/final.mat +# Assume that final.nnet is in nnetdir bnf_nnet=$nnetdir/final.raw -for file in $nnet_lda $bnf_nnet; do - if [ ! -f $file ] ; then - echo "No such file $file"; - exit 1; - fi -done +if [ ! -f $bnf_nnet ] ; then + echo "No such file $bnf_nnet"; + exit 1; +fi + +## Set up input features of nnet +if [ -z "$feat_type" ]; then + if [ -f $nnetdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +fi +echo "$0: feature type is $feat_type" + +if [ "$feat_type" == "lda" ] && [ ! -f $nnetdir/final.mat ]; then + echo "$0: no such file $nnetdir/final.mat" + exit 1 +fi name=`basename $data` sdata=$data/split$nj @@ -55,19 +60,13 @@ sdata=$data/split$nj mkdir -p $dir/log mkdir -p $bnf_data echo $nj > $nnetdir/num_jobs -nnet_plice_opts=`cat $nnetdir/nnet_splice_opts 2>/dev/null` splice_opts=`cat $nnetdir/splice_opts 2>/dev/null` +delta_opts=`cat $nnetdir/delta_opts 2>/dev/null` [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; -## Set up input features of nnet -if [ -z "$feat_type" ]; then - if [ -f $nnetdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -fi -echo "$0: feature type is $feat_type" - case $feat_type in raw) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";; - delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $nnetdir/final.mat ark:- ark:- |" ;; *) echo "Invalid feature type $feat_type" && exit 1; @@ -76,10 +75,16 @@ esac if [ ! -z "$transform_dir" ]; then echo "Using transforms from $transform_dir" [ ! 
-f $transform_dir/trans.1 ] && echo "No such file $transform_dir/trans.1" && exit 1; -# cat $transform_dir/trans.* > $nnetdir/trans || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" + transform_nj=`cat $transform_dir/num_jobs` || exit 1; + if [ "$nj" != "$transform_nj" ]; then + for n in $(seq $transform_nj); do cat $transform_dir/trans.$n; done >$dir/trans.ark + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.ark ark:- ark:- |" + else + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" + fi fi + if [ $stage -le 1 ]; then echo "Making BNF scp and ark." $cmd JOB=1:$nj $dir/log/make_bnf_$name.JOB.log \ @@ -87,23 +92,23 @@ if [ $stage -le 1 ]; then copy-feats --compress=true ark:- ark,scp:$archivedir/raw_bnfeat_$name.JOB.ark,$archivedir/raw_bnfeat_$name.JOB.scp || exit 1; fi -N0=$(cat $data/feats.scp | wc -l) +rm $dir/trans.ark 2>/dev/null + +N0=$(cat $data/feats.scp | wc -l) N1=$(cat $archivedir/raw_bnfeat_$name.*.scp | wc -l) if [[ "$N0" != "$N1" ]]; then echo "Error happens when generating BNF for $name (Original:$N0 BNF:$N1)" exit 1; fi -echo -n >$bnf_data/feats.scp # Concatenate feats.scp into bnf_data -for n in `seq 1 $nj`; do - cat $archivedir/raw_bnfeat_$name.$n.scp >> $bnf_data/feats.scp -done +for n in $(seq $nj); do cat $archivedir/raw_bnfeat_$name.$n.scp; done > $bnf_data/feats.scp for f in segments spk2utt text utt2spk wav.scp char.stm glm kws reco2file_and_channel stm; do [ -e $data/$f ] && cp -r $data/$f $bnf_data/$f done +echo "$0: computing CMVN stats." steps/compute_cmvn_stats.sh $bnf_data $dir $archivedir echo "$0: done making BNF feats.scp." diff --git a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh index c932e0463cc..4c08a08b824 100755 --- a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh +++ b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh @@ -81,29 +81,30 @@ nj=$(cat $denlatdir/num_jobs) || exit 1; # $nj is the number of # splits of the denlats and alignments. +[ "$(readlink /bin/sh)" == dash ] && \ + echo "This script won't work if /bin/sh points to dash. make it point to bash." && exit 1 + nj_ali=$(cat $alidir/num_jobs) || exit 1; sdata=$data/split$nj utils/split_data.sh $data $nj - - - if [ $nj_ali -eq $nj ]; then ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.JOB.gz |" - prior_ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.JOB.gz | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl $dir/priors_uttlist | ali-to-pdf $alidir/final.mdl ark,t:- ark:- |" + all_ids=$(seq -s, $nj) + prior_ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.{$all_ids}.gz | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl $dir/priors_uttlist | ali-to-pdf $alidir/final.mdl ark,t:- ark:- |" else ali_rspecifier="scp:$dir/ali.scp" prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |" if [ $stage -le 1 ]; then echo "$0: number of jobs in den-lats versus alignments differ: dumping them as single archive and index." 
all_ids=$(seq -s, $nj_ali) - copy-int-vector --print-args=false \ - "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; fi fi - splice_opts=`cat $alidir/splice_opts 2>/dev/null` silphonelist=`cat $lang/phones/silence.csl` || exit 1; cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` @@ -124,7 +125,7 @@ else echo 0 > $dir/info/ivector_dim fi -# Get list of validation utterances. +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/priors_uttlist || exit 1; @@ -137,13 +138,13 @@ echo "$0: feature type is $feat_type" case $feat_type in raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" - priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` - cp $alidir/final.mat $dir + cp $alidir/final.mat $dir feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" - priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" ;; *) echo "$0: invalid feature type $feat_type" && exit 1; esac @@ -159,7 +160,7 @@ if [ ! -z "$transform_dir" ]; then [ ! -s $transform_dir/num_jobs ] && \ echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; nj_orig=$(cat $transform_dir/num_jobs) - + if [ $feat_type == "raw" ]; then trans=raw_trans; else trans=trans; fi if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then @@ -173,19 +174,20 @@ if [ ! -z "$transform_dir" ]; then if [ $nj -ne $nj_orig ]; then # Copy the transforms into an archive with an index. for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" - priors_feats="$priors_feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/$trans.scp ark:- ark:- |" else # number of jobs matches with alignment dir. 
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" - priors_feats="$priors_feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + all_ids=`seq -s, $nj` + priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/$trans.{$all_ids} |' ark:- ark:- |" fi fi if [ ! -z $online_ivector_dir ]; then # add iVectors to the features. feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |" - priors_feats="$priors_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |" + priors_feats="$priors_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |" fi @@ -193,7 +195,7 @@ if [ $stage -le 2 ]; then echo "$0: working out number of frames of training data" num_frames=$(steps/nnet2/get_num_frames.sh $data) - echo $num_frames > $dir/info/num_frames + echo $num_frames > $dir/info/num_frames # Working out total number of archives. Add one on the assumption the # num-frames won't divide exactly, and we want to round up. @@ -210,7 +212,7 @@ if [ $stage -le 2 ]; then echo $num_archives >$dir/info/num_archives || exit 1 echo $num_archives_temp >$dir/info/num_archives_temp || exit 1 - + frames_per_archive=$[$num_frames/$num_archives] # note, this is the number of frames per archive prior to discarding frames. @@ -256,38 +258,22 @@ if [ $stage -le 10 ]; then priors_egs_list= for y in `seq $num_archives_priors`; do utils/create_data_link.pl $dir/priors_egs.$y.ark - for x in `seq $nj`; do - utils/create_data_link.pl $dir/priors_egs_orig.$x.$y.ark - done - priors_egs_list="$priors_egs_list ark:$dir/priors_egs_orig.JOB.$y.ark" + priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark" done - + nnet_context_opts="--left-context=$left_context --right-context=$right_context" echo "$0: dumping egs for prior adjustment in the background." -$cmd JOB=1:$nj $dir/log/create_priors_subset.JOB.log \ +$cmd $dir/log/create_priors_subset.log \ nnet-get-egs $ivectors_opt $nnet_context_opts "$priors_feats" \ "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \ ark:- \| nnet-copy-egs ark:- $priors_egs_list || \ - { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.*.log"; exit 1; } + { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.log"; exit 1; } sleep 3; -echo "$0: recombining archives on disk" -# combine all the "priors_egs_orig.JOB.*.scp" (over the $nj splits of the data) and -# writing to the priors_egs.JOB.ark - -priors_egs_list= -for n in $(seq $nj); do - priors_egs_list="$priors_egs_list $dir/priors_egs_orig.$n.JOB.ark" -done - -echo $num_archives_priors >$dir/info/num_archives_priors - -$cmd JOB=1:$num_archives_priors $dir/log/copy_priors_egs.JOB.log \ - nnet-copy-egs "ark:cat $priors_egs_list|" ark:$dir/priors_egs.JOB.ark || \ - { touch $dir/.error; echo "Error in creating priors_egs. 
See $dir/log/copy_priors_egs.*.log"; exit 1; } +echo $num_archives_priors >$dir/info/num_archives_priors fi @@ -306,12 +292,12 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - + degs_list=$(for n in $(seq $nj); do echo $dir/degs_orig.$n.JOB.ark; done) if [ $num_archives -eq $num_archives_temp ]; then echo "$0: combining data into final archives and shuffling it" - + $cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \ cat $degs_list \| nnet-shuffle-egs-discriminative --srand=JOB ark:- \ ark:$dir/degs.JOB.ark || exit 1; @@ -354,10 +340,6 @@ if $cleanup; then file=$dir/degs_orig.$x.$y.ark [ -L $file ] && rm $(readlink -f $file); rm $file done - for y in $(seq $num_archives_priors); do - file=$dir/priors_egs_orig.$x.$y.ark - [ -L $file ] && rm $(readlink -f $file); rm $file - done done if [ $num_archives_temp -ne $num_archives ]; then for z in $(seq $num_archives); do diff --git a/egs/wsj/s5/steps/nnet2/get_lda_block.sh b/egs/wsj/s5/steps/nnet2/get_lda_block.sh index c840e014250..7bd4ecf5647 100755 --- a/egs/wsj/s5/steps/nnet2/get_lda_block.sh +++ b/egs/wsj/s5/steps/nnet2/get_lda_block.sh @@ -104,7 +104,7 @@ while [ $[$cur_index+$block_size] -le $feat_dim ]; do echo >> $dir/indexes num_blocks=$[$num_blocks+1] cur_index=$[$cur_index+$block_shift] - if [ $[$cur_index+$block_size-1] -gt $feat_dim ]; then + if [ $[$cur_index+$block_size] -gt $feat_dim ]; then cur_index=$[$feat_dim-$block_size]; fi done diff --git a/egs/wsj/s5/steps/nnet2/get_num_frames.sh b/egs/wsj/s5/steps/nnet2/get_num_frames.sh deleted file mode 100755 index a960e2fcfe9..00000000000 --- a/egs/wsj/s5/steps/nnet2/get_num_frames.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# This script works out the approximate number of frames in a training directory -# this is sometimes needed by higher-level scripts - -num_samples=1000 - - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# -ne 1 ]; then - ( - echo "Usage: $0 " - echo "Prints the number of frames of data in the data-dir, via sampling rather" - echo "than trying to access all the data." - ) 1>&2 -fi - -data=$1 - -if [ ! -f $data/feats.scp ]; then - if [ -f $data/segments ]; then - echo "$0: $data/feats.scp does not exist, but $data/segments does exist; using that and assuming 100 frames per second." 1>&2 - num_frames=$(cat $data/segments | awk '{x += $4 - $3;} END{print int(x*100);}') || exit 1; - echo $num_frames - exit 0; - else - echo "$0: neither $data/feats.scp nor $data/segments exist." 
1>&2 - exit 1; - fi -fi - - -sample_frames=$(utils/shuffle_list.pl $data/feats.scp | head -n $num_samples | sort | feat-to-len --print-args=false scp:-) - -num_files_orig=$(wc -l <$data/feats.scp) -if [ $num_samples -lt $num_files_orig ]; then - num_files_sampled=$num_samples -else - num_files_sampled=$num_files_orig -fi - -perl -e "\$n = int(($sample_frames * 1.0 * $num_files_orig) / (1.0 * $num_files_sampled)); print \"\$n\n\";"; diff --git a/egs/wsj/s5/steps/nnet2/get_num_frames.sh b/egs/wsj/s5/steps/nnet2/get_num_frames.sh new file mode 120000 index 00000000000..d5eab6ede07 --- /dev/null +++ b/egs/wsj/s5/steps/nnet2/get_num_frames.sh @@ -0,0 +1 @@ +../../utils/data/get_num_frames.sh \ No newline at end of file diff --git a/egs/wsj/s5/steps/nnet2/remove_egs.sh b/egs/wsj/s5/steps/nnet2/remove_egs.sh index da0484e954a..143a5d0d86a 100755 --- a/egs/wsj/s5/steps/nnet2/remove_egs.sh +++ b/egs/wsj/s5/steps/nnet2/remove_egs.sh @@ -34,10 +34,8 @@ if [ -f $egs/.nodelete ]; then fi -flist=$egs/egs.*.ark - -for f in $egs/egs.*.ark $egs/degs.*.ark; do +for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs.*.ark; do if [ -L $f ]; then rm $(dirname $f)/$(readlink $f) # this will print a warning if it fails. fi diff --git a/egs/wsj/s5/steps/nnet2/retrain_simple2.sh b/egs/wsj/s5/steps/nnet2/retrain_simple2.sh index d3f5223b59d..9e018015075 100755 --- a/egs/wsj/s5/steps/nnet2/retrain_simple2.sh +++ b/egs/wsj/s5/steps/nnet2/retrain_simple2.sh @@ -355,7 +355,7 @@ while [ $x -lt $num_iters ]; do $cmd $parallel_opts $dir/log/train.$x.$n.log \ nnet-train$parallel_suffix $parallel_train_opts \ --minibatch-size=$this_minibatch_size --srand=$x $dir/$x.mdl \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ $dir/$[$x+1].$n.mdl || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh index 1c34749ba7f..a99075f2aef 100755 --- a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh +++ b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh @@ -4,14 +4,16 @@ # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar +# 2015 Xingyu Na # Apache 2.0. # train_convnet_accel2.sh is modified from train_pnorm_accel2.sh. It propotypes -# the training of a ConvNet. The ConvNet is composed of 4 layers. The first layer +# the training of a ConvNet. The ConvNet is composed of 4 hidden layers. The first layer # is a Convolutional1d component plus a Maxpooling component. The second layer # is a single Convolutional1d component. The third and fourth layers are affine # components with ReLU nonlinearities. Due to non-squashing output, normalize -# component is applied to all four layers. +# component is applied to all four layers. The number of hidden layers is hard +# coded now. # train_pnorm_accel2.sh is a modified form of train_pnorm_simple2.sh (the "2" # suffix is because they both use the the "new" egs format, created by @@ -63,7 +65,7 @@ shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of # (the point of this is to get data in different minibatches on different iterations, # since in the preconditioning method, 2 samples in the same minibatch can # affect each others' gradients. - +num_hidden_layers=4 add_layers_period=2 # by default, add new layers every 2 iterations. 
stage=-3 @@ -84,6 +86,7 @@ patch_dim1=7 # dim of convolutional kernel in the first layer pool_size=3 # size of pooling after the first convolutional layer num_filters2=256 # number of filters in the second convolutional layer patch_dim2=4 # dim of convolutional kernel in the second layer +patch_step2=1 # patch step of the second convolutional layer mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) @@ -262,10 +265,8 @@ if [ $stage -le -2 ]; then tot_input_dim=$[$feat_dim*$tot_splice] num_patch1=$[1+($feat_dim-$patch_dim1)/$patch_step1] num_pool=$[$num_patch1/$pool_size] - patch_dim2=$[$patch_dim2*$num_filters1] - patch_step2=$num_filters1 - patch_stride2=$[$num_pool*$num_filters1] # same as pool outputs - num_patch2=$[1+($num_pool*$num_filters1-$patch_dim2)/$patch_step2] + patch_stride2=$num_pool + num_patch2=$[1+($patch_stride2-$patch_dim2)/$patch_step2] conv_out_dim1=$[$num_filters1*$num_patch1] # 128 x (36 - 7 + 1) pool_out_dim=$[$num_filters1*$num_pool] conv_out_dim2=$[$num_filters2*$num_patch2] @@ -284,7 +285,7 @@ SoftmaxComponent dim=$num_leaves EOF cat >$dir/replace.1.config < #tree leaves, if # specified.) num_threads=16 -parallel_opts="--num-threads 16 --mem 1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 @@ -87,12 +90,12 @@ lda_opts= lda_dim= egs_opts= transform_dir= # If supplied, overrides alidir -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -143,15 +146,15 @@ if [ $# != 4 ]; then echo " # Format : layer/....layer/ " echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" - echo " # realignment is to be done" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." 
- + exit 1; fi @@ -258,7 +261,7 @@ if [ $stage -le -2 ]; then online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample" initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);") - + # create the config files for nnet initialization python steps/nnet2/make_multisplice_configs.py \ --splice-indexes "$splice_indexes" \ @@ -279,7 +282,7 @@ if [ $stage -le -2 ]; then nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \ $dir/0.mdl || exit 1; fi - +if [ $pnorm_input_dim -eq $pnorm_output_dim ] && [ $fix_nnet ]; then fix_nnet=true;fi if [ $stage -le -1 ]; then echo "Training transition probabilities and setting priors" $cmd $dir/log/train_trans.log \ @@ -290,16 +293,16 @@ if [ $stage -le -1 ]; then echo "prepare initial vector for FixedScaleComponent before softmax" echo "use priors^$presoftmax_prior_scale_power and rescale to average 1" - # obtains raw pdf count + # obtains raw pdf count $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ post-to-tacc --per-pdf=true --binary=false $alidir/final.mdl ark:- $dir/JOB.pacc || exit 1; cat $dir/*.pacc > $dir/pacc rm $dir/*.pacc awk -v power=$presoftmax_prior_scale_power \ - '{ for(i=2; i<=NF-1; i++) {sum[i]+=$i} } + '{ for(i=2; i<=NF-1; i++) {sum[i]+=$i} } END { - for (i=2; i<=NF-1; i++) {total+=sum[i]} + for (i=2; i<=NF-1; i++) {total+=sum[i]} ave_pdf=int(total/(NF-2)); total+=0.01*ave_pdf*(NF-2) for (i=2; i<=NF-1; i++) {rescale+=((sum[i]+0.01*ave_pdf)/total)^power} rescale/=(NF-2) @@ -310,7 +313,7 @@ if [ $stage -le -1 ]; then echo "insert an additional layer of FixedScaleComponent before softmax" inp=`nnet-am-info $dir/0.mdl | grep 'Softmax' | awk '{print $2}'` nnet-init $dir/per_element.config - | nnet-insert --insert-at=$inp --randomize-next-component=false $dir/0.mdl - $dir/0.mdl - fi + fi fi # set num_iters so that as close as possible, we process the data $num_epochs @@ -393,13 +396,13 @@ cur_egs_dir=$egs_dir while [ $x -lt $num_iters ]; do [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; - + if [ $x -gt $[$num_iters/2] ]; then fix_nnet=false; fi this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -444,7 +447,7 @@ while [ $x -lt $num_iters ]; do steps/nnet2/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -496,7 +499,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. 
- + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -512,7 +515,7 @@ while [ $x -lt $num_iters ]; do $cmd $parallel_opts $dir/log/train.$x.$n.log \ nnet-train$parallel_suffix $parallel_train_opts \ --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ $dir/$[$x+1].$n.mdl || touch $dir/.error & done wait @@ -535,11 +538,15 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; fi + if $fix_nnet; then + # do nnet-am-fix to fix some pathology in the network + nnet-am-fix --max-average-deriv=$max_average --min-average-deriv=$min_average $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log || exit; + fi if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then echo "Warning: the mix up opertion is disabled!" @@ -569,7 +576,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/wsj/s5/steps/nnet2/train_multisplice_ensemble.sh b/egs/wsj/s5/steps/nnet2/train_multisplice_ensemble.sh index 02f02804153..a5cef8aea44 100755 --- a/egs/wsj/s5/steps/nnet2/train_multisplice_ensemble.sh +++ b/egs/wsj/s5/steps/nnet2/train_multisplice_ensemble.sh @@ -490,7 +490,7 @@ while [ $x -lt $num_iters ]; do nnet-train-ensemble \ --minibatch-size=$this_minibatch_size --srand=$x \ --beta=$beta $nnets_ensemble_in \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ ark:- $nnets_ensemble_out || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh b/egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh index 01dbe9b5dbf..4176d347ccd 100755 --- a/egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh +++ b/egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). 
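The embedded perl snippet in the training-loop hunk above selects, among the parallel jobs of one iteration, the model whose log reports the best log-prob-per-frame; the new nnet-am-fix step then repairs pathological activations in the chosen model. A rough Python equivalent of the log scan, assuming the same "log-prob-per-frame=" log format (illustrative, not the code the script actually runs):

import re

def best_job(num_jobs_nnet, log_pattern):
    # log_pattern is a printf-style pattern such as "exp/nnet/log/train.3.%d.log"
    # (hypothetical path); returns the 1-based job index whose last reported
    # log-prob-per-frame is highest.
    best_n, best_logprob = 1, -1.0e+10
    for n in range(1, num_jobs_nnet + 1):
        logprob = None
        with open(log_pattern % n) as f:
            for line in f:
                m = re.search(r'log-prob-per-frame=(\S+)', line)
                if m:
                    logprob = float(m.group(1))   # keep the last occurrence
        if logprob is not None and logprob > best_logprob:
            best_logprob, best_n = logprob, n
    return best_n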
# 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -27,12 +27,12 @@ num_epochs=15 # Number of epochs of training; initial_effective_lrate=0.01 final_effective_lrate=0.001 bias_stddev=0.5 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 p=2 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh @@ -77,7 +77,7 @@ precondition_rank_out=80 # relates to online preconditioning mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="--num-threads 16 --mem 1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 @@ -90,12 +90,12 @@ egs_opts= io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. transform_dir= # If supplied, overrides alidir postdir= -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -139,15 +139,15 @@ if [ $# != 4 ]; then echo " --splice-width # Number of frames on each side to append for feature input" echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" - echo " # realignment is to be done" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -205,7 +205,7 @@ ivector_dim=$(cat $dir/ivector_dim) || exit 1; lda_dim=$(cat $dir/lda_dim) || exit 1; if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then - echo "$0: calling get_egs2.sh" + echo "$0: calling get_egs2.sh" steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" --io-opts "$io_opts" \ --postdir "$postdir" --samples-per-iter $samples_per_iter --stage $get_egs_stage \ --cmd "$cmd" $egs_opts $data $alidir $dir/egs || exit 1; @@ -253,7 +253,7 @@ SoftmaxComponent dim=$num_leaves EOF # to hidden.config it will write the part of the config corresponding to a - # single hidden layer; we need this to add new layers. + # single hidden layer; we need this to add new layers. 
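As the --realign-times help text above describes, each supplied fraction is multiplied by the number of training iterations to pick the iteration at which realignment happens. A one-line sketch of that mapping (Python, illustrative; the exact rounding used by the real script is an assumption here):

def realign_iters(realign_times, num_iters):
    return [int(float(t) * num_iters) for t in realign_times.split()]

print(realign_iters("0.25 0.5", 200))   # [50, 100]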
cat >$dir/hidden.config <) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; @@ -521,7 +521,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/wsj/s5/steps/nnet2/train_pnorm_multisplice2.sh b/egs/wsj/s5/steps/nnet2/train_pnorm_multisplice2.sh index 2708eb85636..3e6c0c2ed96 100755 --- a/egs/wsj/s5/steps/nnet2/train_pnorm_multisplice2.sh +++ b/egs/wsj/s5/steps/nnet2/train_pnorm_multisplice2.sh @@ -429,7 +429,7 @@ while [ $x -lt $num_iters ]; do $cmd $parallel_opts $dir/log/train.$x.$n.log \ nnet-train$parallel_suffix $parallel_train_opts \ --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ $dir/$[$x+1].$n.mdl || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh b/egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh index 44639ebd2d9..fe0f4cf7a37 100755 --- a/egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh +++ b/egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh @@ -465,7 +465,7 @@ while [ $x -lt $num_iters ]; do $cmd $parallel_opts $dir/log/train.$x.$n.log \ nnet-train$parallel_suffix $parallel_train_opts \ --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ $dir/$[$x+1].$n.mdl || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet3/adjust_priors.sh b/egs/wsj/s5/steps/nnet3/adjust_priors.sh new file mode 100755 index 00000000000..60d377f18e8 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/adjust_priors.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +. path.sh + +# This script computes the DNN output averaged over a small subset of +# training egs and stores it in post.$iter.vec. +# This is used for the purpose of adjusting the nnet priors. +# When --use-raw-nnet is false, then the computed priors is added into the +# nnet model; hence the term adjust priors. +# When --use-raw-nnet is true, the computed priors is not added into the +# nnet model and left in the file post.$iter.vec. + +cmd=run.pl +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +use_gpu=false # if true, we run on GPU. +egs_type=egs # Compute from $egs_type.*.ark in $egs_dir + # If --egs-type is degs, then the program + # nnet3-discriminative-compute-from-egs is used + # instead of nnet3-compute-from-egs. 
+use_raw_nnet=false # If raw nnet, the averaged posterior is computed + # and stored in post.$iter.vec; but there is no + # adjusting of priors +iter=final + +. utils/parse_options.sh + +echo "$0 $@" # Print the command line for logging + +if [ $# -ne 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 exp/nnet3_sad_snr/tdnn_train_100k_whole_1k_splice2_2_relu500" + exit 1 +fi + +dir=$1 +egs_dir=$2 + +if $use_gpu; then + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" +else + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + +for f in $egs_dir/$egs_type.1.ark $egs_dir/info/num_archives; do + if [ ! -f $f ]; then + echo "$f not found" + exit 1 + fi +done + +if $use_raw_nnet; then + model=$dir/$iter.raw +else + model="nnet3-am-copy --raw=true $dir/$iter.mdl - |" +fi + +rm -f $dir/post.$iter.*.vec 2>/dev/null + +left_context=`cat $egs_dir/info/left_context` || exit 1 +right_context=`cat $egs_dir/info/right_context` || exit 1 + +context_opts="--left-context=$left_context --right-context=$right_context" + +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; +else egs_part=JOB; fi + +if [ $egs_type != "degs" ]; then + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ + nnet3-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "$model" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; +else + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ + nnet3-discriminative-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ + nnet3-discriminative-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-discriminative-merge-egs ark:- ark:- \| \ + nnet3-compute-from-degs $prior_gpu_opt --apply-exp=true \ + "$model" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; + +fi + +sleep 3; # make sure there is time for $dir/post.$iter.*.vec to appear. + +$cmd $dir/log/vector_sum.$iter.log \ + vector-sum $dir/post.$iter.*.vec $dir/post.$iter.vec || exit 1; + +if ! $use_raw_nnet; then + run.pl $dir/log/adjust_priors.$iter.log \ + nnet3-am-adjust-priors $dir/$iter.mdl $dir/post.$iter.vec $dir/$iter.adj.mdl +fi + +rm -f $dir/post.$iter.*.vec; + diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh new file mode 100755 index 00000000000..e151876c690 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Copyright 2012 Brno University of Technology (Author: Karel Vesely) +# 2013 Johns Hopkins University (Author: Daniel Povey) +# 2015 Vijayaditya Peddinti +# 2016 Vimal Manohar +# Apache 2.0 + +# Computes training alignments using nnet3 DNN + +# Begin configuration section. +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +transform_dir= +iter=final +use_gpu=true +frames_per_chunk=50 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +feat_type= # you can set this to force it to use delta features. +# End configuration options. 
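Conceptually, the new adjust_priors.sh script above estimates pdf priors by averaging the network's exponentiated outputs over a sample of training examples: each job writes a summed posterior vector (post.$iter.JOB.vec), the vectors are summed, and nnet3-am-adjust-priors stores the normalized result in the model (at decode time the acoustic scores are, in effect, divided by these priors). A numpy sketch of the averaging step only, assuming the posteriors have already been collected as per-job matrices (function names are illustrative, not the actual binaries):

import numpy as np

def estimate_priors(posteriors_per_job):
    # posteriors_per_job: list of (num_frames_j, num_pdfs) softmax outputs, one per job;
    # summing rows per job and then summing the per-job vectors mirrors
    # matrix-sum-rows | vector-sum followed by the final vector-sum call.
    total = sum(p.sum(axis=0) for p in posteriors_per_job)
    return total / total.sum()   # normalized pdf prior

rng = np.random.default_rng(0)
fake_posteriors = [rng.dirichlet(np.ones(10), size=100) for _ in range(3)]
print(estimate_priors(fake_posteriors).round(3))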
+ +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [--transform-dir ] " + echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +if $use_gpu; then + queue_opt="--gpu 1" + gpu_opt="--use-gpu=yes" +else + queue_opt="" + gpu_opt="--use-gpu=no" +fi + +extra_files= +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +cp $srcdir/{tree,${iter}.mdl} $dir || exit 1; + + +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; + lda) + splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + cp $srcdir/splice_opts $dir 2>/dev/null + cp $srcdir/final.mat $dir || exit 1; + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +fi + +ivector_opts= +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + # note: subsample-feats, with negative n, will repeat each feature -n times. 
+ ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector_period=$ivector_period" +fi + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" + cp $srcdir/frame_subsampling_factor $dir +fi + +$cmd $queue_opt JOB=1:$nj $dir/log/align.JOB.log \ + compile-train-graphs $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" ark:- \| \ + nnet3-align-compiled $scale_opts $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + $gpu_opt --beam=$beam --retry-beam=$retry_beam \ + $srcdir/${iter}.mdl ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + +echo "$0: done aligning data." + diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh new file mode 100755 index 00000000000..a2cb9927393 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -0,0 +1,179 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + + +# This script builds a tree for use in the 'chain' systems (although the script +# itself is pretty generic and doesn't use any 'chain' binaries). This is just +# like the first stages of a standard system, like 'train_sat.sh', except it +# does 'convert-ali' to convert alignments to a monophone topology just created +# from the 'lang' directory (in case the topology is different from where you +# got the system's alignments from), and it stops after the tree-building and +# model-initialization stage, without re-estimating the Gaussians or training +# the transitions. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +cmd=run.pl +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 +leftmost_questions_truncate=10 +tree_stats_opts= +cluster_phones_opts= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: steps/train_sat.sh <#leaves> " + echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + exit 1; +fi + +numleaves=$1 +data=$2 +lang=$3 +alidir=$4 +dir=$5 + +for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do + [ ! 
-f $f ] && echo "train_sat.sh: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $alidir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null # delta option. + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. + +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +## Set up speaker-independent features. +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + cp $alidir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +# Add fMLLR transforms if available +if [ -f $alidir/trans.1 ]; then + echo "$0: Using transforms from $alidir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" +fi + +# Do subsampling of feats, if needed +if [ $frame_subsampling_factor -gt 1 ]; then + feats="$feats subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" +fi + +if [ $stage -le -5 ]; then + echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" + + [ ! -f $lang/phones/sets.int ] && exit 1; + shared_phones_opt="--shared-phones=$lang/phones/sets.int" + # get feature dimension + example_feats="`echo $feats | sed s/JOB/1/g`"; + if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then + feat-to-dim "$example_feats" - # to see the error message. + echo "error getting feature dimension" + exit 1; + fi + $cmd JOB=1 $dir/log/init_mono.log \ + gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree || exit 1; +fi + + +if [ $stage -le -4 ]; then + # Get tree stats. + echo "$0: Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + convert-ali --frame-subsampling-factor=$frame_subsampling_factor \ + $alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "$feats" ark:- $dir/JOB.treeacc || exit 1; + [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... 
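Earlier in this tree-building script the features are passed through subsample-feats --n=$frame_subsampling_factor, and the alignments are converted with the same factor, so that tree statistics are accumulated at the reduced 'chain' frame rate. With a positive n the effect is simply to keep every n-th frame; a tiny numpy sketch (illustrative, assuming frames 0, n, 2n, ... are the ones kept):

import numpy as np

frame_subsampling_factor = 3
feats = np.random.default_rng(0).normal(size=(300, 40))       # 300 frames, 40-dim features
subsampled = feats[::frame_subsampling_factor]                # keep every 3rd frame
print(feats.shape, subsampled.shape)                          # (300, 40) (100, 40)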
+ $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions --leftmost-questions-truncate=$leftmost_questions_truncate \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + # questions_truncated.int will be needed later on when we build the phone + # language model for 'chain' training. It's a mechanism of keeping the graph + # small. + if [ $leftmost_questions_truncate -gt 0 ]; then + head -n $leftmost_questions_truncate $dir/questions.int > $dir/questions_truncated.int + else + cp $dir/questions.int $dir/questions_truncated.int + fi + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the1alignments to the new tree. Note: we likely will not use these + # converted alignments in the CTC system directly, but they could be useful + # for other purposes. + echo "$0: Converting alignments from $alidir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + convert-ali --frame-subsampling-factor=$frame_subsampling_factor \ + $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl b/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl new file mode 100755 index 00000000000..32dfa272a97 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl @@ -0,0 +1,42 @@ +#!/usr/bin/env perl + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +if (@ARGV != 2) { + print STDERR "Usage: utils/gen_topo.pl \n"; + print STDERR "e.g.: utils/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n"; + exit (1); +} + +($nonsil_phones, $sil_phones) = @ARGV; + +$nonsil_phones =~ s/:/ /g; +$sil_phones =~ s/:/ /g; +$nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n"; +$sil_phones =~ m/^\d[ \d]*$/ || die "$0: bad arguments @ARGV\n"; + +print "\n"; +print "\n"; +print "\n"; +print "$nonsil_phones $sil_phones\n"; +print "\n"; +# The next two lines may look like a bug, but they are as intended. State 0 has +# no self-loop, it happens exactly once. And it can go either to state 1 (with +# a self-loop) or to state 2, so we can have zero or more instances of state 1 +# following state 0. 
+# We make the transition-probs 0.5 so they normalize, to keep the code happy. +# In fact, we always set the transition probability scale to 0.0 in the 'chain' +# code, so they are never used. +print " 0 0 1 0.5 2 0.5 \n"; +print " 1 1 1 0.5 2 0.5 \n"; +print " 2 \n"; +print "\n"; +print "\n"; diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py new file mode 100755 index 00000000000..fdd7a02fd88 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") +# The next two lines may look like a bug, but they are as intended. State 0 has +# no self-loop, it happens exactly once. And it can go either to state 1 (with +# a self-loop) or to state 2, so we can have zero or more instances of state 1 +# following state 0. +# We make the transition-probs 0.5 so they normalize, to keep the code happy. +# In fact, we always set the transition probability scale to 0.0 in the 'chain' +# code, so they are never used. +print(" 0 0 1 0.5 2 0.5 ") +print(" 1 1 1 0.5 2 0.5 ") +print(" 2 ") +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py new file mode 100755 index 00000000000..a33dab666e6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. 
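Note that the print statements in gen_topo.pl and gen_topo.py above have lost their angle-bracket markup in this patch text (the topology is written in Kaldi's <Topology> format). Based on the surrounding comments, the intended output is a two-emitting-state entry in which state 0 occurs exactly once, state 1 carries the self-loop, and state 2 is the final state; a hedged reconstruction of roughly what the script prints (exact spacing and tag layout may differ from the original):

def print_chain_topo(phones):
    print("<Topology>")
    print("<TopologyEntry>")
    print("<ForPhones>")
    print(" ".join(str(p) for p in phones))
    print("</ForPhones>")
    # state 0: no self-loop, happens exactly once
    print("<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>")
    # state 1: optional self-looping state
    print("<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>")
    # state 2: final, non-emitting
    print("<State> 2 </State>")
    print("</TopologyEntry>")
    print("</Topology>")

print_chain_topo([1, 2, 3])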
+ +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") + +# the pdf-classes are as follows: +# pdf-class 0 is in a 1-frame sequence, the initial and final state. +# pdf-class 1 is in a sequence with >=3 frames, the 'middle' states. (important that +# it be numbered 1, which is the default list of pdf-classes used in 'cluster-phones'). +# pdf-class 2 is the initial-state in a sequence with >= 2 frames. +# pdf-class 3 is the final-state in a sequence with >= 2 frames. +# state 0 is nonemitting in this topology. + +print(" 0 1 0.5 2 0.5 ") # initial nonemitting state. +print(" 1 0 5 1.0 ") # 1-frame sequence. +print(" 2 2 3 0.5 4 0.5 ") # 2 or more frames +print(" 3 1 3 0.5 4 0.5 ") # 3 or more frames +print(" 4 3 5 1.0 ") # 2 or more frames. +print(" 5 ") # final nonemitting state + +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py new file mode 100755 index 00000000000..f43f5046813 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") +print(" 0 0 0 0.5 1 0.5 ") +print(" 1 ") +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py new file mode 100755 index 00000000000..6d88a6e4449 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") +# state 0 is obligatory (occurs once) +print(" 0 0 1 0.3333 2 0.3333 3 0.3333 ") +# state 1 is used only when >2 frames +print(" 1 1 1 0.5 2 0.5 ") +# state 2 is used only when >=2 frames (and occurs once) +print(" 2 2 3 1.0 ") +print(" 3 ") # final nonemitting state +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py new file mode 100755 index 00000000000..1583966b58c --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. 
+ +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") +# state 0 is nonemitting +print(" 0 1 0.5 2 0.5 ") +# state 1 is for when we traverse it in 1 state +print(" 1 0 4 1.0 ") +# state 2 is for when we traverse it in >1 state, for the first state. +print(" 2 2 3 1.0 ") +# state 3 is for the self-loop. Use pdf-class 1 here so that the default +# phone-class clustering (which uses only pdf-class 1 by default) gets only +# stats from longer phones. +print(" 3 1 3 0.5 4 0.5 ") +print(" 4 ") +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo6.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo6.py new file mode 100755 index 00000000000..d62cd4aaee4 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo6.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] + +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones])) +print("") +# The next two lines may look like a bug, but they are as intended. State 0 has +# no self-loop, it happens exactly once. And it can go either to state 1 (with +# a self-loop) or to state 2, so we can have zero or more instances of state 1 +# following state 0. +# We make the transition-probs 0.5 so they normalize, to keep the code happy. +# In fact, we always set the transition probability scale to 0.0 in the 'chain' +# code, so they are never used. 
+print(" 0 0 1 0.5 2 0.5 ") +print(" 1 1 1 0.5 2 0.5 ") +print(" 2 2 3 0.5 4 0.5 ") +print(" 3 3 3 0.5 4 0.5 ") +print(" 4 4 5 0.5 6 0.5 ") +print(" 5 5 5 0.5 6 0.5 ") +print(" 6 ") +print("") + +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +print(" 0 0 0 0.25 1 0.25 2 0.25 3 0.25 ") +print(" 1 1 1 0.25 2 0.25 3 0.25 4 0.25 ") +print(" 2 2 1 0.25 2 0.25 3 0.25 4 0.25 ") +print(" 3 3 1 0.25 2 0.25 3 0.25 4 0.25 ") +print(" 4 4 4 0.75 5 0.25 ") +print(" 5 ") +print("") +print("") diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh new file mode 100755 index 00000000000..d3112752856 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the 'chain' system +# (and also the validation examples used for diagnostics), and puts them in +# separate archives. +# +# This script dumps egs with many frames of labels, controlled by the +# frames_per_eg config variable (default: 25), plus left and right context. +# Because CTC training involves alignment of data, we can't meaningfully train +# frame by frame. The supervision approach involves the time alignment, though-- +# it is just applied in a loose way, where each symbol can appear in the +# frame-range that it was in in the alignment, extended by a certain margin. +# + + +# Begin configuration section. +cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. +frames_per_eg=25 # number of feature frames example (not counting added context). + # more->less disk space and less time preparing egs, but more + # I/O during training. note: the script may reduce this if + # reduce_frames_per_eg is true. +frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +cut_zero_frames=-1 # if activated, activates new-style derivative weights.. i'll reorganize + # this if it works well. +frame_subsampling_factor=3 # frames-per-second of features we train on divided + # by frames-per-second at output of chain model +alignment_subsampling_factor=3 # frames-per-second of input alignments divided + # by frames-per-second at output of chain model +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. +num_valid_egs_combine=0 # #validation examples for combination weights at the very end. +num_train_egs_combine=1000 # number of train examples for the above. +num_egs_diagnostic=400 # number of frames for "compute_prob" jobs +frames_per_iter=400000 # each iteration of training, see this many frames per + # job, measured at the sampling rate of the features + # used. 
This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +right_tolerance= #CTC right tolerance == max label delay. +left_tolerance= + +transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms + +stage=0 +nj=15 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. +max_shuffle_jobs_run=100 # the shuffle jobs now include the nnet3-chain-normalize-egs command, + # which is fairly CPU intensive, so we can run quite a few at once + # without overloading the disks. +online_ivector_dir= # can be used if we are including speaker information as iVectors. +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs" + echo "" + echo "From , 0.trans_mdl (the transition-model), tree (the tree)" + echo "and normalization.fst (the normalization FST, derived from the denominator FST)" + echo "are read." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --nj # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frames-per-iter <#samples;400000> # Number of frames of data to process per iteration, per" + echo " # process." + echo " --feat-type # (raw is the default). The feature type you want" + echo " # to use as input to the neural net." + echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --frames-per-eg # number of supervised frames per eg on disk" + echo " --frames-overlap-per-eg # number of supervised frames of overlap between egs" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" + echo " --num-valid-egs-combine <#frames;10000> # Number of egss used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +# Check some files. +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + +num_lat_jobs=$(cat $latdir/num_jobs) || exit 1; + +# Get list of validation utterances. 
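The selection just below keeps only utterances long enough to yield at least frames_per_eg frames (duration divided by the frame shift), shuffles them, and takes the first num_utts_subset as the validation list; the training subset is picked the same way after excluding the validation utterances. A minimal sketch of the filter (Python, illustrative; the helper name is not from Kaldi):

import random

def pick_subset(utt2dur, frame_shift=0.01, frames_per_eg=25, num_utts_subset=300, seed=0):
    # utt2dur: dict utt-id -> duration in seconds, as in the utt2dur file
    long_enough = [u for u, dur in utt2dur.items() if dur / frame_shift >= frames_per_eg]
    random.Random(seed).shuffle(long_enough)
    return long_enough[:num_utts_subset]

print(pick_subset({"utt1": 0.2, "utt2": 3.0, "utt3": 1.5}, num_utts_subset=2))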
+ +frame_shift=$(utils/data/get_frame_shift.sh $data) +utils/data/get_utt2dur.sh $data + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; + +len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + # because of this stage we can again have utts with lengths less than + # frames_per_eg + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +[ -z "$transform_dir" ] && transform_dir=$latdir + +# because we'll need the features with a different number of jobs than $latdir, +# copy to ark,scp. +if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + + + +## Set up features. +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + ;; + lda) + splice_opts=`cat $latdir/splice_opts 2>/dev/null` + # caution: the top-level nnet training script should copy these to its own dir now. + cp $latdir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1; + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." 
&& exit 1; + cmvn_opts=$(cat $dir/cmvn_opts) + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +else + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. 
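To recap the archive-sizing arithmetic in this hunk (including the exact-multiple rounding done on the next line): the number of archives comes from the total frame count and frames_per_iter, and if that would exceed the open-file limit, the script instead writes a smaller number of intermediate archives, each later split archives_multiple ways. A sketch with illustrative numbers (Python; the function name is not from Kaldi):

def plan_archives(num_frames, frames_per_iter, frames_per_eg, max_open_filehandles):
    num_archives = num_frames // frames_per_iter + 1            # +1 to round up
    num_archives_intermediate = num_archives
    archives_multiple = 1
    while num_archives_intermediate + 4 > max_open_filehandles:
        archives_multiple += 1
        num_archives_intermediate = num_archives // archives_multiple
    num_archives = archives_multiple * num_archives_intermediate   # exact multiple
    egs_per_archive = num_frames // (frames_per_eg * num_archives)
    return num_archives, num_archives_intermediate, archives_multiple, egs_per_archive

print(plan_archives(num_frames=36_000_000, frames_per_iter=400_000,
                    frames_per_eg=25, max_open_filehandles=64))
# -> (90, 45, 2, 16000)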
+num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +echo $num_archives + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done) + done +fi + +if [ $stage -le 2 ]; then + echo "$0: copying training lattices" + + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + + for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp +fi + + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress --cut-zero-frames=$cut_zero_frames" + + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; +# don't do the overlap thing for the validation data. +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" + +ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + ctc_supervision_all_opts="$ctc_supervision_all_opts --right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + ctc_supervision_all_opts="$ctc_supervision_all_opts --left-tolerance=$left_tolerance" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + echo "$0: ... extracting validation and training-subset alignments." 
+ + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/lat.scp >$dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ + chain-get-supervision $ctc_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ + ark:- ark:- \| \ + nnet3-chain-get-egs $valid_ivector_opt $valid_egs_opts $chaindir/normalization.fst \ + "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ + chain-get-supervision $ctc_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $train_subset_ivector_opt $valid_egs_opts $chaindir/normalization.fst \ + "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + ark:$dir/valid_diagnostic.cegs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + ark:$dir/train_diagnostic.cegs || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs +fi + +if [ $stage -le 4 ]; then + # create cegs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to egs_list. Note: we omit the + # 'normalization.fst' argument while creating temporary egs: the phase of egs + # preparation that involves the normalization FST is quite CPU-intensive and + # it's more convenient to do it later, in the 'shuffle' stage. Otherwise to + # make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. 
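The file-count worry in the comment above is easy to quantify: stage 4 leaves behind nj * num_archives_intermediate temporary cegs_orig archives, which is why nj is kept moderate and the CPU-heavy normalization step is deferred to the shuffle stage. A rough illustration with example values:

    nj = 15                          # example number of generation jobs
    num_archives_intermediate = 26   # from the earlier bookkeeping
    print(nj * num_archives_intermediate)   # 390 cegs_orig.<job>.<archive>.ark files
    # with nj = 40 the same data would already produce 1040 temporary files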
+ $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $ctc_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opt $egs_opts \ + "$feats" ark,s,cs:- ark:- \| \ + nnet3-chain-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=JOB ark:- ark:$dir/cegs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=JOB ark:- ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + fi +fi + +if [ $stage -le 6 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null + ) + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/cegs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary lattices" + rm $dir/lat.* + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + +fi + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py new file mode 100644 index 00000000000..d6819e25060 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py @@ -0,0 +1,245 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. 
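Before moving on to the Python library, one detail of the shuffle stage above is worth making concrete: when archives_multiple > 1, each intermediate archive is split across archives_multiple final archives through soft links named cegs.x.y.ark, and the real archive index is recovered as (x-1)*archives_multiple + y. With illustrative numbers:

    archives_multiple = 3
    num_archives_intermediate = 1000      # example values only
    for x in (1, 2):                      # intermediate-archive index
        for y in range(1, archives_multiple + 1):
            archive_index = (x - 1) * archives_multiple + y
            print("cegs.%d.%d.ark -> cegs.%d.ark" % (x, y, archive_index))
    # cegs.1.1.ark -> cegs.1.ark ... cegs.2.3.ark -> cegs.6.ark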
+ + +import subprocess +import logging +import math +import re +import time +import imp +import os + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def GetNumberOfLeaves(dir): + [stdout, stderr] = train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + parts = stdout.split() + #number of pdfs 7115 + assert(' '.join(parts[0:3]) == "number of pdfs") + num_leaves = int(parts[3]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + +def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts = None): + train_lib.RunKaldiCommand(""" + {command} {dir}/log/make_phone_lm.log \ + chain-est-phone-lm {lm_opts} \ + "ark:gunzip -c {tree_dir}/ali.*.gz | ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ + {dir}/phone_lm.fst + """.format(command = run_opts.command, + dir = dir, + lm_opts = lm_opts if lm_opts is not None else '', + tree_dir = tree_dir)) + +def CreateDenominatorFst(dir, tree_dir, run_opts): + train_lib.RunKaldiCommand(""" + copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl + {command} {dir}/log/make_den_fst.log \ + chain-make-den-fst {dir}/tree {dir}/0.trans_mdl {dir}/phone_lm.fst \ + {dir}/den.fst {dir}/normalization.fst""".format( + tree_dir = tree_dir, dir = dir, command = run_opts.command)) + +def GenerateChainEgs(dir, data, lat_dir, egs_dir, + left_context, right_context, + run_opts, stage = 0, + valid_left_context = None, valid_right_context = None, + left_tolerance = None, right_tolerance = None, + frame_subsampling_factor = 3, + alignment_subsampling_factor = 3, + feat_type = 'raw', online_ivector_dir = None, + frames_per_iter = 20000, frames_per_eg = 20, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + train_lib.RunKaldiCommand(""" +steps/nnet3/chain/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context '{valid_left_context}' \ + --valid-right-context '{valid_right_context}' \ + --left-tolerance '{left_tolerance}' \ + --right-tolerance '{right_tolerance}' \ + --frame-subsampling-factor {frame_subsampling_factor} \ + --alignment-subsampling-factor {alignment_subsampling_factor} \ + --stage {stage} \ + --frames-per-iter {frames_per_iter} \ + --frames-per-eg {frames_per_eg} \ + {data} {dir} {lat_dir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context if valid_left_context is not None else '', + valid_right_context = valid_right_context if valid_right_context is not None else '', + left_tolerance = left_tolerance if left_tolerance is not None else '', + right_tolerance = right_tolerance if right_tolerance is not None else '', + frame_subsampling_factor = frame_subsampling_factor, + alignment_subsampling_factor = 
alignment_subsampling_factor, + stage = stage, frames_per_iter = frames_per_iter, + frames_per_eg = frames_per_eg, + data = data, lat_dir = lat_dir, dir = dir, egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +# this function is exactly similar to the version in nnet3_train_lib.py +# except it uses egs files in place of cegs files +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs = None, rand_prune = 4.0, + lda_opts = None): + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + + # Write stats with the same format as stats for LDA. + train_lib.RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" {dir}/JOB.lda_stats""".format( + command = run_opts.command, + num_lda_jobs = num_lda_jobs, + dir = dir, + egs_dir = egs_dir, + rand_prune = rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command = run_opts.command, + dir = dir, lda_stat_files = " ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + train_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +def PrepareInitialAcousticModel(dir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-1 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + + # The model-format for a 'chain' acoustic model is just the transition + # model and then the raw nnet, so we can use 'cat' to create this, as + # long as they have the same mode (binary or not binary). + # We ensure that they have the same mode (even if someone changed the + # script to make one or both of them text mode) by copying them both + # before concatenating them. + train_lib.RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw {dir}/0.mdl""".format( + command = run_opts.command, dir = dir)) + +def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, + egs_dir, leaky_hmm_coefficient, l2_regularize, + xent_regularize, run_opts): + # Now do combination. 
In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + train_lib.RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-chain-combine --num-iters=40 \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {dir}/den.fst {raw_models} "ark,bg:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ +"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + dir = dir, raw_models = " ".join(raw_model_strings), + num_chunk_per_minibatch = num_chunk_per_minibatch, + num_iters = num_iters, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait = False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + """.format(command = run_opts.command, + dir = dir, iter = iter, model = model, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + xent_reg = xent_regularize, + egs_dir = egs_dir), wait = wait) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + xent_reg = xent_regularize, + egs_dir = egs_dir), wait = wait) + +def ComputeProgress(dir, iter, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + train_lib.RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-am-info {model} '&&' \ +nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + prev_model = prev_model), wait = wait) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py new file mode 100755 index 00000000000..2c12ee27b45 --- /dev/null +++ 
b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -0,0 +1,706 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + +import os +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +import shutil +import math + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting chain model trainer (train.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains RNN and DNN acoustic models using the 'chain' objective function. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="directory with the ivectors extracted in an online fashion.") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 150, + help="Number of output labels in each example. Caution: if you double this you should halve --trainer.samples-per-iter.") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 0, + help="Number of additional frames of input to the left" + " of the input chunk. This extra context will be used" + " in the estimation of RNN state before prediction of" + " the first label. In the case of FF-DNN this extra" + " context will be used to allow for frame-shifts") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="Number of additional frames of input to the right" + " of the input chunk. This extra context will be used" + " in the estimation of bidirectional RNN state before" + " prediction of the first label.") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="String to provide options directly to steps/nnet3/get_egs.sh script") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="Directory with egs. 
If specified this directory " + "will be used rather than extracting egs") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = -6, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="String to provide options directly to steps/nnet3/get_egs.sh script") + + # chain options + parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="options to be be passed to chain-est-phone-lm") + parser.add_argument("--chain.l2-regularize", type=float, dest='l2_regularize', + default = 0.0, + help="Weight of regularization function which is the" + " l2-norm of the output of the network. It should be" + " used without the log-softmax layer for the outputs." + " As l2-norm of the log-softmax outputs can dominate" + " the objective function.") + parser.add_argument("--chain.xent-regularize", type=float, dest='xent_regularize', + default = 0.0, + help="Weight of regularization function which is the" + " cross-entropy cost the outputs.") + parser.add_argument("--chain.right-tolerance", type=int, dest='right_tolerance', + default = 5, help="") + parser.add_argument("--chain.left-tolerance", type=int, dest='left_tolerance', + default = 5, help="") + parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', + default = 0.00001, help="") + parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', + default=True, action=train_lib.StrToBoolAction, + choices = ["true", "false"], + help="") + parser.add_argument("--chain.truncate-deriv-weights", type=float, dest='truncate_deriv_weights', + default =0, + help="Can be used to set to zero the weights of derivs" + " from frames near the edges. (counts subsampled frames)") + parser.add_argument("--chain.frame-subsampling-factor", type=int, + dest='frame_subsampling_factor', + default = 3, + help="ratio of frames-per-second of features we train" + " on, to chain model's output") + parser.add_argument("--chain.alignment-subsampling-factor", type=int, + dest='alignment_subsampling_factor', + default = 3, + help="ratio of frames-per-second of input alignments to" + " chain model's output") + parser.add_argument("--chain.ngram-order", type=int, dest='ngram_order', + default = 3, help="") + parser.add_argument("--chain.left-deriv-truncate", type=int, + dest='left_deriv_truncate', + default = None, help="") + parser.add_argument("--chain.right-deriv-truncate", type=int, + dest='right_deriv_truncate', + default = None, help="") + + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 10, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final" + " model combination stage. 
These models will themselves" + " be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + " iteration. If 0 or a large value the randomization is" + " complete, but this will consume memory and cause spikes" + " in disk I/O. Smaller is easier on disk and memory but" + " less random. It's not a huge deal though, as samples" + " are anyway randomized right at the start. (the point" + " of this is to get data in different minibatches on" + " different iterations, since in the preconditioning" + " method, 2 samples in the same minibatch can affect" + " each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + " during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per" + " minibatch, measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.frames-per-iter", type=int, dest='frames_per_iter', + default=800000, + help ="Each iteration of training, see this many [input]" + " frames per job. This option is passed to get_egs.sh." + " Aim for about a minute of training time") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="Value used in preconditioning matrix estimation") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="Max number of jobs used for LDA stats accumulation") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0002, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00002, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at" + " the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = "The is the maximum number of models we give to" + " the final 'combine' stage, but these models will" + " themselves be averages of iteration-number ranges.") + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="Momentum used in update computation." 
+ " Note: we implemented it in such a way that it doesn't" + " increase the effective learning rate.") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 1.0, + help="Scaling factor used for scaling the parameter" + " matrices when the derivative averages are below the" + " shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this" + " threshold we scale the parameter matrices with the" + " shrink-value. It is less than 0.25 for sigmoid non-linearities.") + parser.add_argument("--trainer.optimization.shrink-nonlinearity", type=str, dest='shrink_nonlinearity', + default = "SigmoidComponent", choices = ["TanhComponent", "SigmoidComponent"], + help="The non-linear component from which the" + " deriv-avg values are going to used to compute" + " mean-deriv-avg. The mean-deriv-avg is going to be" + " compared with shrink-threshold. Be careful to specify" + " a shrink-threshold which is dependent on the" + " shrink-nonlinearity type") + + # RNN specific trainer options + parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=512, + help="Number of sequences to be processed in parallel every minibatch" ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = train_lib.NullstrToNoneAction, dest="command", + help="Specifies the script to launch jobs." + " e.g. queue.pl for launching on SGE cluster run.pl" + " for launching on local machine", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="If true, remove egs after experiment") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = train_lib.NullstrToNoneAction, + help="Email-id to report about the progress of the experiment. NOTE: It assumes the machine on which the script is being run can send emails from command line via. mail program. The Kaldi mailing list will not support this feature. It might require local expertise to setup. ") + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--tree-dir", type=str, required = True, + help="Languade directory") + parser.add_argument("--lat-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + print(sys.argv) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be non-negative") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be non-negative") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + if args.transform_dir is None: + args.transform_dir = args.lat_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not train_lib.CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + + run_opts.command = args.command + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.parallel_train_opts = None + + +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
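To see why each training job needs its own archive and frame shift, the index arithmetic used a few lines below can be traced with small example numbers (illustrative only; the original file is Python 2, where k/num_archives is integer division, written here with //):

    num_archives = 78                # e.g. 26 archives x frame-subsampling-factor 3
    frame_subsampling_factor = 3
    num_archives_processed = 80      # archives consumed on earlier iterations
    for job in (1, 2, 3):
        k = num_archives_processed + job - 1
        archive_index = (k % num_archives) + 1
        frame_shift = (archive_index + k // num_archives) % frame_subsampling_factor
        print(job, archive_index, frame_shift)
    # job 1 -> archive 3, shift 1; job 2 -> archive 4, shift 2; job 3 -> archive 5, shift 0:
    # parallel jobs read different archives and see the data at different frame offsets.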
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + deriv_time_opts="" + if left_deriv_truncate is not None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) + if right_deriv_truncate is not None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + if job == 1: + cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) + else: + cur_cache_io_opts = cache_io_opts + + process_handle = train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-chain-train {parallel_train_opts} \ + --apply-deriv-weights={app_deriv_wts} \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" {dir}/den.fst \ + "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + deriv_time_opts = deriv_time_opts, + trunc_deriv = truncate_deriv_weights, + app_deriv_wts = apply_deriv_weights, + fr_shft = frame_shift, l2 = l2_regularize, + xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + cache_io_opts = cur_cache_io_opts, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + if stderr_value.strip() != '': + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, shuffle_buffer_size, + frame_subsampling_factor, truncate_deriv_weights, + run_opts): + + # Set off jobs doing some diagnostics, in the background. 
+ # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + + if iter > 0: + chain_lib.ComputeProgress(dir, iter, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file) + cache_io_opts = "" + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, cur_max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts) + + [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. 
+ train_lib.RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + train_lib.RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + if os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + +def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): + for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: + if not os.path.isfile(file): + raise Exception('Expected {0} to exist.'.format(file)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Check files + CheckForRequiredFiles(args.feat_dir, args.tree_dir, args.lat_dir) + + # Set some variables. + num_jobs = train_lib.GetNumberOfJobs(args.tree_dir) + feat_dim = train_lib.GetFeatDim(args.feat_dir) + ivector_dim = train_lib.GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + train_lib.SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [model_left_context, model_right_context, num_hidden_layers] = train_lib.ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
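The negative --stage values handled below follow a fixed layout; the lookup table here is only a reading aid distilled from the code that follows (passing --stage N skips every step whose checkpoint is less than N):

    CHAIN_TRAIN_STAGES = {
        -6: "estimate the phone language model (chain-est-phone-lm)",
        -5: "build the denominator and normalization FSTs",
        -4: "initialize init.raw from configs/init.config",
        -3: "generate egs with steps/nnet3/chain/get_egs.sh (unless --egs.dir is given)",
        -2: "accumulate the LDA-like preconditioning matrix",
        -1: "add the first layer and assemble 0.mdl",
        0: "first iteration of the main training loop",
    }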
+ if (args.stage <= -6): + logger.info("Creating phone language-model") + chain_lib.CreatePhoneLm(args.dir, args.tree_dir, run_opts, lm_opts = args.lm_opts) + + if (args.stage <= -5): + logger.info("Creating denominator FST") + chain_lib.CreateDenominatorFst(args.dir, args.tree_dir, run_opts) + + if (args.stage <= -4): + logger.info("Initializing a basic network for estimating preconditioning matrix") + train_lib.RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + # this is where get_egs.sh is called. + chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, + left_context + args.frame_subsampling_factor/2, + right_context + args.frame_subsampling_factor/2, + run_opts, + left_tolerance = args.left_tolerance, + right_tolerance = args.right_tolerance, + frame_subsampling_factor = args.frame_subsampling_factor, + alignment_subsampling_factor = args.alignment_subsampling_factor, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + frames_per_iter = args.frames_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + num_archives_expanded = num_archives * args.frame_subsampling_factor + + if (args.num_jobs_final > num_archives_expanded): + raise Exception('num_jobs_final cannot exceed the expanded number of archives') + + # copy the properties of the egs to dir for + # use during decoding + train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + chain_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + chain_lib.PrepareInitialAcousticModel(args.dir, run_opts) + + file_handle = open("{0}/frame_subsampling_factor".format(args.dir),"w") + file_handle.write(str(args.frame_subsampling_factor)) + file_handle.close() + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
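The relation described in the comment above can be sanity-checked with example numbers; num_archives_expanded is num_archives multiplied by the frame-subsampling factor, and the job counts below are the script defaults:

    num_epochs = 10
    num_archives_expanded = 78        # e.g. 26 archives x frame-subsampling-factor 3
    num_jobs_initial, num_jobs_final = 1, 8

    num_archives_to_process = num_epochs * num_archives_expanded              # 780
    num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)
    print(num_iters)   # 173; 173 iterations x ~4.5 average jobs ~= 778 archive passes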
+ num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = train_lib.VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: train_lib.GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if args.shrink_value != 1.0: + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if train_lib.DoShrinkage(iter, model_file, args.shrink_nonlinearity, args.shrink_threshold) else 1 + else: + shrinkage_value = args.shrink_value + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value, + args.num_chunk_per_minibatch, + num_hidden_layers, args.add_layers_period, + args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, + args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, + args.frame_subsampling_factor, + args.truncate_deriv_weights, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, key="log-probability") + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + train_lib.SendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + chain_lib.CombineModels(args.dir, num_iters, num_iters_combine, + args.num_chunk_per_minibatch, egs_dir, + args.leaky_hmm_coefficient, args.l2_regularize, + args.xent_regularize, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = 
nnet3_log_parse.GenerateAccuracyReport(args.dir, "log-probability") + if args.email is not None: + train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh new file mode 100755 index 00000000000..036da48cdc9 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -0,0 +1,645 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. +# This version of the script, nnet3/chain/train_tdnn.sh, is for 'chain' systems. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=10 # Number of epochs of training; + # the number of iterations is worked out from this. + # Be careful with this: we actually go over the data + # num-epochs * frame-subsampling-factor times, due to + # using different data-shifts. +truncate_deriv_weights=0 # can be used to set to zero the weights of derivs from frames + # near the edges. (counts subsampled frames). +apply_deriv_weights=true +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +extra_left_context=0 # actually for recurrent setups. +pnorm_input_dim=3000 +pnorm_output_dim=300 +relu_dim= # you can use this to make it use ReLU's instead of p-norms. + +jesus_opts= # opts to steps/nnet3/make_jesus_configs.py. + # If nonempty, assumes you want to use the jesus nonlinearity, + # and you should supply various options to that script in + # this string. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +lm_opts= # options to chain-est-phone-lm +l2_regularize=0.0 +leaky_hmm_coefficient=0.00001 +xent_regularize=0.0 +frames_per_iter=800000 # each iteration of training, see this many [input] + # frames per job. This option is passed to get_egs.sh. + # Aim for about a minute of training time +right_tolerance=5 # tolerance at the same frame-rate as the alignment directory. +left_tolerance=5 # tolerance at the same frame-rate as the alignment directory. +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +frame_subsampling_factor=3 # ratio of frames-per-second of features we train + # on, to chain model's output +alignment_subsampling_factor=3 # ratio of frames-per-second of input alignments + # to chain model's output +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +max_param_change=2.0 +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. 
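num_jobs_initial and num_jobs_final above control a linear ramp in the number of parallel training jobs; the Python driver (train.py, earlier in this patch) computes the per-iteration job count with the formula below, shown here with illustrative numbers:

    num_jobs_initial, num_jobs_final = 1, 8
    num_iters = 173
    for it in (0, 43, 86, 129, 172):
        current_num_jobs = int(0.5 + num_jobs_initial
                               + (num_jobs_final - num_jobs_initial) * float(it) / num_iters)
        print(it, current_num_jobs)    # -> 1, 3, 4, 6 and 8 jobs respectively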
+ngram_order=3 + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. +final_layer_normalize_target=1.0 # you can set this to less than one if you + # think the final layer is learning too fast + # compared with the other layers. +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-7 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + + +# count space-separated fields in splice_indexes to get num-hidden-layers. +splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +pool_type='none' +pool_window= +pool_lpfilter_width= + +# Format : layer/....layer/ " +# note: hidden layers which are composed of one or more components, +# so hidden layer indexing is different from component count + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +max_lda_jobs=20 # use no more than 20 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of latdir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. + # only relevant for "raw" features, not lda. +feat_type=raw # or set to 'lda' to use LDA features. +frames_per_eg=25 # number of frames of output per chunk. To be passed on to get_egs.sh. +left_deriv_truncate= # number of time-steps to avoid using the deriv of, on the left. +right_deriv_truncate= # number of time-steps to avoid using the deriv of, on the right. + +# End configuration section. + +trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/chain/tri3b_tree exp/tri3_latali exp/chain/tdnn_a" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. 
queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --frames-per-iter <#frames|400000> # Number of frames of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +treedir=$2 +latdir=$3 +dir=$4 + + +# Check some files. +for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \ + $latdir/lat.1.gz $latdir/final.mdl $latdir/num_jobs $latdir/splice_opts; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +nj=`cat $treedir/num_jobs` || exit 1; # number of jobs in alignment dir... + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $treedir/tree $dir + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! -f $treedir/final.mat ] && echo "$0: With --feat-type lda option, expect $treedir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $treedir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + +if [ $stage -le -7 ]; then + echo "$0: creating phone language-model" + + $cmd $dir/log/make_phone_lm.log \ + chain-est-phone-lm $lm_opts \ + "ark:gunzip -c $treedir/ali.*.gz | ali-to-phones $treedir/final.mdl ark:- ark:- |" \ + $dir/phone_lm.fst || exit 1 +fi + +if [ $stage -le -6 ]; then + echo "$0: creating denominator FST" + copy-transition-model $treedir/final.mdl $dir/0.trans_mdl + $cmd $dir/log/make_den_fst.log \ + chain-make-den-fst $dir/tree $dir/0.trans_mdl $dir/phone_lm.fst \ + $dir/den.fst $dir/normalization.fst || exit 1; +fi + +# work out num-leaves +num_leaves=$(am-info $dir/0.trans_mdl | grep -w pdfs | awk '{print $NF}') || exit 1; +[ $num_leaves -gt 0 ] || exit 1; + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! -z "$jesus_opts" ]; then + $cmd $dir/log/make_configs.log \ + python steps/nnet3/make_jesus_configs.py \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $jesus_opts \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + else + [ $xent_regularize != "0.0" ] && \ + echo "$0: --xent-regularize option not supported by tdnn/make_configs.py." && exit 1; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + --include-log-softmax=false \ + --final-layer-normalize-target $final_layer_normalize_target \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --num-targets $num_leaves \ + --use-presoftmax-prior-scale false \ + $dir/configs || exit 1; + fi + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +# the next 2 lines are in case the configs were created by an older +# config-generating script, which writes to left_context and right_context +# instead of model_left_context and model_right_context. +[ -z $model_left_context ] && model_left_context=$left_context +[ -z $model_right_context ] && model_right_context=$right_context + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$latdir + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + # we need a bit of extra left-context and right-context to allow for frame + # shifts (we use shifted version of the data for more variety). + extra_opts+=(--left-context $[$model_left_context+$frame_subsampling_factor/2+$extra_left_context]) + extra_opts+=(--right-context $[$model_right_context+$frame_subsampling_factor/2]) + echo "$0: calling get_egs.sh" + steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \ + --frames-per-iter $frames_per_iter --stage $get_egs_stage \ + --cmd "$cmd" \ + --right-tolerance "$right_tolerance" \ + --left-tolerance "$left_tolerance" \ + --frames-per-eg $frames_per_eg \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $alignment_subsampling_factor \ + $data $dir $latdir $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs in $egs_dir: $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs in $egs_dir: $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. 
+cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null
+
+# confirm that the egs_dir has the necessary context (especially important if
+# the --egs-dir option was used on the command line).
+egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
+egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
+( [ $egs_left_context -lt $model_left_context ] || \
+ [ $egs_right_context -lt $model_right_context ] ) && \
+ echo "$0: egs in $egs_dir have too little context" && exit -1;
+
+frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
+num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
+
+num_archives_expanded=$[$num_archives*$frame_subsampling_factor]
+
+[ $num_jobs_initial -gt $num_jobs_final ] && \
+ echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1;
+
+[ $num_jobs_final -gt $num_archives_expanded ] && \
+ echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1;
+
+if [ $stage -le -3 ]; then
+ echo "$0: getting preconditioning matrix for input features."
+ num_lda_jobs=$num_archives
+ [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs
+
+ # Write stats with the same format as stats for LDA.
+ $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
+ nnet3-chain-acc-lda-stats --rand-prune=$rand_prune \
+ $dir/init.raw "ark:$egs_dir/cegs.JOB.ark" $dir/JOB.lda_stats || exit 1;
+
+ all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
+ $cmd $dir/log/sum_transform_stats.log \
+ sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1;
+
+ rm $all_lda_accs || exit 1;
+
+ # this computes a fixed affine transform computed in the way we described in
+ # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant
+ # of an LDA transform but without dimensionality reduction.
+ $cmd $dir/log/get_transform.log \
+ nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1;
+
+ ln -sf ../lda.mat $dir/configs/lda.mat
+fi
+
+if [ $stage -le -1 ]; then
+ # Add the first layer; this will add in the lda.mat and
+ # presoftmax_prior_scale.vec.
+
+ echo "$0: creating initial raw model"
+ $cmd $dir/log/add_first_layer.log \
+ nnet3-init --srand=-1 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;
+
+
+ # The model-format for a 'chain' acoustic model is just the transition
+ # model and then the raw nnet, so we can use 'cat' to create this, as
+ # long as they have the same mode (binary or not binary).
+ # We ensure that they have the same mode (even if someone changed the
+ # script to make one or both of them text mode) by copying them both
+ # before concatenating them.
+
+ echo "$0: creating initial model"
+ $cmd $dir/log/init_model.log \
+ nnet3-am-init $dir/0.trans_mdl $dir/0.raw $dir/0.mdl || exit 1;
+fi
+
+echo $frame_subsampling_factor >$dir/frame_subsampling_factor || exit 1;
+
+# set num_iters so that as close as possible, we process the data $num_epochs
+# times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives_expanded
+# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
+
+num_archives_to_process=$[$num_epochs*$num_archives_expanded]
+num_archives_processed=0
+num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]
+
+finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
+
+! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
+ && echo "$0: Insufficient epochs" && exit 1
+
+echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
+
+if $use_gpu; then
+ parallel_suffix=""
+ train_queue_opt="--gpu 1"
+ combine_queue_opt="--gpu 1"
+ prior_gpu_opt="--use-gpu=yes"
+ prior_queue_opt="--gpu 1"
+ parallel_train_opts=
+ if ! cuda-compiled; then
+ echo "$0: WARNING: you are running with one thread but you have not compiled"
+ echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
+ echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
+ exit 1
+ fi
+else
+ echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads."
+ parallel_train_opts="--use-gpu=no"
+ train_queue_opt="--num-threads $num_threads"
+ combine_queue_opt="" # the combine stage will be quite slow if not using
+ # GPU, as we didn't enable that program to use
+ # multiple threads.
+ prior_gpu_opt="--use-gpu=no"
+ prior_queue_opt=""
+fi
+
+
+approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
+# First work out how many iterations we want to combine over in the final
+# nnet3-chain-combine invocation. (We may end up subsampling from these if the
+# number exceeds max_models_combine). The number we use is:
+# min(max(max_models_combine, approx_iters_per_epoch_final),
+# 1/2 * iters_after_last_layer_added)
+num_iters_combine=$max_models_combine
+if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then
+ num_iters_combine=$approx_iters_per_epoch_final
+fi
+half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2]
+if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then
+ num_iters_combine=$half_iters_after_add_layers
+fi
+first_model_combine=$[$num_iters-$num_iters_combine+1]
+
+x=0
+
+deriv_time_opts=
+[ ! -z "$left_deriv_truncate" ] && deriv_time_opts="--optimization.min-deriv-time=$left_deriv_truncate"
+[ ! -z "$right_deriv_truncate" ] && \
+ deriv_time_opts="$deriv_time_opts --optimization.max-deriv-time=$((frames_per_eg - right_deriv_truncate))"
+
+
+while [ $x -lt $num_iters ]; do
+ [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;
+
+ this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")
+
+ ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
+ this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");
+
+ echo "On iteration $x, learning rate is $this_learning_rate."
+
+
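The learning rate printed above decays exponentially in the number of archives processed, interpolating from --initial-effective-lrate to --final-effective-lrate, and is multiplied by the current number of parallel jobs (whose models are averaged afterwards). A minimal Python sketch of the same schedule the perl one-liners compute, using made-up values in place of the quantities the script derives from the data:

    import math

    # hypothetical values, for illustration only
    num_jobs_initial, num_jobs_final = 2, 8
    num_iters = 100
    num_archives_to_process = 1200
    initial_effective_lrate, final_effective_lrate = 0.001, 0.0001

    def schedule(x, num_archives_processed):
        this_num_jobs = int(0.5 + num_jobs_initial +
                            (num_jobs_final - num_jobs_initial) * x / float(num_iters))
        if x + 1 >= num_iters:
            effective_lrate = final_effective_lrate
        else:
            effective_lrate = initial_effective_lrate * math.exp(
                num_archives_processed *
                math.log(final_effective_lrate / initial_effective_lrate) /
                num_archives_to_process)
        # the rate actually passed to nnet3-chain-train is scaled by the job count
        return this_num_jobs, effective_lrate * this_num_jobs

+ if [ $x -ge 0 ] && [ $stage -le $x ]; then
+
+ # Set off jobs doing some diagnostics, in the background. 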
+ # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & + + if [ $x -gt 0 ]; then + # This doesn't use the egs, it only shows the relative change in model parameters. + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" \ + "nnet3-am-copy --raw=true $dir/$x.mdl - |" '&&' \ + nnet3-am-info $dir/$x.mdl & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + cache_io_opts="" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + cache_io_opts="--read-cache=$dir/cache.$x" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + this_max_param_change=$max_param_change + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + this_max_param_change=$(perl -e "print ($max_param_change/sqrt(2));") + fi + + rm $dir/.error 2>/dev/null + + + ( + trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame_shift=$[($k/$num_archives)%$frame_subsampling_factor]; + if [ $n -eq 1 ]; then + # opts for computation cache (storing compiled computation). 
+ this_cache_io_opts="$cache_io_opts --write-cache=$dir/cache.$[$x+1]" + else + this_cache_io_opts="$cache_io_opts" + fi + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-chain-train --apply-deriv-weights=$apply_deriv_weights \ + --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + $this_cache_io_opts $parallel_train_opts $deriv_time_opts \ + --max-param-change=$this_max_param_change \ + --print-interval=10 "$mdl" $dir/den.fst \ + "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights=$truncate_deriv_weights --frame-shift=$frame_shift ark:$egs_dir/cegs.$archive.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-chain-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + models_to_average=$(steps/nnet3/get_successful_models.py --difference-threshold 0.1 $this_num_jobs $dir/log/train.$x.%.log) + nnets_list= + for n in $models_to_average; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + else + # choose the best from the different jobs. + n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].mdl ] && exit 1; + if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ + [ $[($x-1)%10] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].mdl + fi + fi + rm $dir/cache.$x 2>/dev/null + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.mdl" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + [ ! -f $dir/$iter.mdl ] && echo "Expected $mdl to exist" && exit 1; + mdl="nnet3-am-copy --raw=true $dir/$iter.mdl - |" + nnets_list[$n]="$mdl"; + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) 
+ + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-chain-combine --num-iters=40 --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 $dir/den.fst "${nnets_list[@]}" "ark,bg:nnet3-chain-merge-egs --minibatch-size=$minibatch_size ark:$egs_dir/combine.cegs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$first_model_combine.mdl $dir/final.mdl" || exit 1; + + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & +fi + +if [ ! -f $dir/final.mdl ]; then + echo "$0: $dir/final.mdl does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. 
+ rm $dir/$x.mdl + fi + done +fi diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py new file mode 100644 index 00000000000..e9723c392cc --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -0,0 +1,442 @@ +#!/usr/bin/env python + +from __future__ import print_function +import os +import argparse +import sys +import warnings +import copy +from operator import itemgetter + +def GetSumDescriptor(inputs): + sum_descriptors = inputs + while len(sum_descriptors) != 1: + cur_sum_descriptors = [] + pair = [] + while len(sum_descriptors) > 0: + value = sum_descriptors.pop() + if value.strip() != '': + pair.append(value) + if len(pair) == 2: + cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1])) + pair = [] + if pair: + cur_sum_descriptors.append(pair[0]) + sum_descriptors = cur_sum_descriptors + return sum_descriptors + +# adds the input nodes and returns the descriptor +def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + output_dim = 0 + components.append('input-node name=input dim=' + str(feat_dim)) + list = [('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_indexes] + output_dim += len(splice_indexes) * feat_dim + if ivector_dim > 0: + components.append('input-node name=ivector dim=' + str(ivector_dim)) + list.append('ReplaceIndex(ivector, t, 0)') + output_dim += ivector_dim + if len(list) > 1: + splice_descriptor = "Append({0})".format(", ".join(list)) + else: + splice_descriptor = list[0] + print(splice_descriptor) + return {'descriptor': splice_descriptor, + 'dimension': output_dim} + +def AddNoOpLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) + component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']} + +def AddLdaLayer(config_lines, name, input, lda_file): + return AddFixedAffineLayer(config_lines, name, input, lda_file) + +def AddFixedAffineLayer(config_lines, name, input, matrix_file): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_fixaffine type=FixedAffineComponent matrix={1}'.format(name, matrix_file)) + component_nodes.append('component-node name={0}_fixaffine component={0}_fixaffine input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_fixaffine'.format(name), + 'dimension': input['dimension']} + + +def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + assert((input['dimension'] % num_blocks == 0) and + (output_dim % num_blocks == 0)) + components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) + component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) + + return {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim} + +def AddPermuteLayer(config_lines, name, input, column_map): + components = 
config_lines['components'] + component_nodes = config_lines['component-nodes'] + permute_indexes = ",".join(map(lambda x: str(x), column_map)) + components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) + component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']} + +def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) + component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) + + return {'descriptor': '{0}_affine'.format(name), + 'dimension': output_dim} + +def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) + components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) + components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) + + component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) + component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) + component_nodes.append("component-node name={0}_renorm component={0}_renorm input={0}_relu".format(name)) + + return {'descriptor': '{0}_renorm'.format(name), + 'dimension': output_dim} + +def AddConvolutionLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + num_filters, input_vectorization, + param_stddev = None, bias_stddev = None, + filter_bias_file = None, + is_updatable = True): + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + conv_init_string = ("component name={name}_conv type=ConvolutionComponent " + "input-x-dim={input_x_dim} input-y-dim={input_y_dim} input-z-dim={input_z_dim} " + "filt-x-dim={filt_x_dim} filt-y-dim={filt_y_dim} " + "filt-x-step={filt_x_step} filt-y-step={filt_y_step} " + "input-vectorization-order={vector_order}".format(name = name, + input_x_dim = input_x_dim, input_y_dim = input_y_dim, input_z_dim = input_z_dim, + filt_x_dim = filt_x_dim, filt_y_dim = filt_y_dim, + filt_x_step = filt_x_step, filt_y_step = filt_y_step, + vector_order = input_vectorization)) + if filter_bias_file is not None: + conv_init_string += " matrix={0}".format(filter_bias_file) + else: + conv_init_string += " num-filters={0}".format(num_filters) + + components.append(conv_init_string) + component_nodes.append("component-node 
name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor'])) + + num_x_steps = (1 + (input_x_dim - filt_x_dim) / filt_x_step) + num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) + output_dim = num_x_steps * num_y_steps * num_filters; + return {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim, + '3d-dim': [num_x_steps, num_y_steps, num_filters], + 'vectorization': 'zyx'} + +# The Maxpooling component assumes input vectorizations of type zyx +def AddMaxpoolingLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + pool_x_size, pool_y_size, pool_z_size, + pool_x_step, pool_y_step, pool_z_step): + if input_x_dim < 1 or input_y_dim < 1 or input_z_dim < 1: + raise Exception("non-positive maxpooling input size ({0}, {1}, {2})". + format(input_x_dim, input_y_dim, input_z_dim)) + if pool_x_size > input_x_dim or pool_y_size > input_y_dim or pool_z_size > input_z_dim: + raise Exception("invalid maxpooling pool size vs. input size") + if pool_x_step > pool_x_size or pool_y_step > pool_y_size or pool_z_step > pool_z_size: + raise Exception("invalid maxpooling pool step vs. pool size") + + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={name}_maxp type=MaxpoolingComponent ' + 'input-x-dim={input_x_dim} input-y-dim={input_y_dim} input-z-dim={input_z_dim} ' + 'pool-x-size={pool_x_size} pool-y-size={pool_y_size} pool-z-size={pool_z_size} ' + 'pool-x-step={pool_x_step} pool-y-step={pool_y_step} pool-z-step={pool_z_step} '. + format(name = name, + input_x_dim = input_x_dim, input_y_dim = input_y_dim, input_z_dim = input_z_dim, + pool_x_size = pool_x_size, pool_y_size = pool_y_size, pool_z_size = pool_z_size, + pool_x_step = pool_x_step, pool_y_step = pool_y_step, pool_z_step = pool_z_step)) + + component_nodes.append('component-node name={0}_maxp_t component={0}_maxp input={1}'.format(name, input['descriptor'])) + + num_pools_x = 1 + (input_x_dim - pool_x_size) / pool_x_step; + num_pools_y = 1 + (input_y_dim - pool_y_size) / pool_y_step; + num_pools_z = 1 + (input_z_dim - pool_z_size) / pool_z_step; + output_dim = num_pools_x * num_pools_y * num_pools_z; + + return {'descriptor': '{0}_maxp_t'.format(name), + 'dimension': output_dim, + '3d-dim': [num_pools_x, num_pools_y, num_pools_z], + 'vectorization': 'zyx'} + + +def AddSoftmaxLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append("component name={0}_log_softmax type=LogSoftmaxComponent dim={1}".format(name, input['dimension'])) + component_nodes.append("component-node name={0}_log_softmax component={0}_log_softmax input={1}".format(name, input['descriptor'])) + + return {'descriptor': '{0}_log_softmax'.format(name), + 'dimension': input['dimension']} + +def AddPerDimAffineLayer(config_lines, name, input, input_window): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 
'dimension':len(filter_input_splice_indexes) * input['dimension']}
+
+ # add permute component to shuffle the feature columns of the Append
+ # descriptor output so that columns corresponding to the same feature index
+ # are contiguous. Then add a block-affine component to collapse all the feature
+ # indexes across time steps into a single value.
+ num_feats = input['dimension']
+ num_times = len(filter_input_splice_indexes)
+ column_map = []
+ for i in range(num_feats):
+ for j in range(num_times):
+ column_map.append(j * num_feats + i)
+
+ composite_config_lines = {'components':[], 'component-nodes':[]}
+
+ permuted_output_descriptor = AddPermuteLayer(composite_config_lines,
+ name, filter_input_descriptor, column_map)
+
+ # add a block-affine component
+ output_descriptor = AddBlockAffineLayer(composite_config_lines, name,
+ permuted_output_descriptor,
+ num_feats, num_feats)
+
+
+ # strip names
+ ccl = composite_config_lines['components']
+ composite_config_line = ''
+ for index in range(len(ccl)):
+ parts = ccl[index].split()
+ assert(parts[0] == "component" and parts[1].split('=')[0] == "name")
+ composite_config_line += " component{0}='{1}'".format(index+1, " ".join(parts[2:]))
+
+ components.append("component name={name} type=CompositeComponent num-components={nc} {rest}".format(name = '{0}_PDA'.format(name),
+ nc = len(ccl),
+ rest = composite_config_line))
+ component_nodes.append("component-node name={0}_PDA component={0}_PDA input={1}".format(name, filter_input_descriptor['descriptor']))
+ return [{'descriptor': '{0}_PDA'.format(name),
+ 'dimension': output_descriptor['dimension']
+ }, filter_context, filter_context]
+
+def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None):
+ components = config_lines['components']
+ component_nodes = config_lines['component-nodes']
+
+ self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else ''
+ components.append("component name={0}_sigmoid type=SigmoidComponent dim={1} {2}".format(name, input['dimension'], self_repair_string))
+ component_nodes.append("component-node name={0}_sigmoid component={0}_sigmoid input={1}".format(name, input['descriptor']))
+ return {'descriptor': '{0}_sigmoid'.format(name),
+ 'dimension': input['dimension']}
+
+def AddOutputLayer(config_lines, input, label_delay = None, suffix=None, objective_type = "linear"):
+ components = config_lines['components']
+ component_nodes = config_lines['component-nodes']
+ name = 'output'
+ if suffix is not None:
+ name = '{0}-{1}'.format(name, suffix)
+
+ if label_delay is None:
+ component_nodes.append('output-node name={0} input={1} objective={2}'.format(name, input['descriptor'], objective_type))
+ else:
+ component_nodes.append('output-node name={0} input=Offset({1},{2}) objective={3}'.format(name, input['descriptor'], label_delay, objective_type))
+
+def AddFinalLayer(config_lines, input, output_dim,
+ ng_affine_options = " param-stddev=0 bias-stddev=0 ",
+ label_delay=None,
+ use_presoftmax_prior_scale = False,
+ prior_scale_file = None,
+ include_log_softmax = True,
+ add_final_sigmoid = False,
+ name_affix = None,
+ objective_type = "linear"):
+ components = config_lines['components']
+ component_nodes = config_lines['component-nodes']
+
+ if name_affix is not None:
+ final_node_prefix = 'Final-' + str(name_affix)
+ else:
+ final_node_prefix = 'Final'
+
+ prev_layer_output = AddAffineLayer(config_lines,
+ final_node_prefix , input, output_dim,
+ ng_affine_options)
+ if include_log_softmax:
+ if 
use_presoftmax_prior_scale : + components.append('component name={0}-fixed-scale type=FixedScaleComponent scales={1}'.format(final_node_prefix, prior_scale_file)) + component_nodes.append('component-node name={0}-fixed-scale component={0}-fixed-scale input={1}'.format(final_node_prefix, + prev_layer_output['descriptor'])) + prev_layer_output['descriptor'] = "{0}-fixed-scale".format(final_node_prefix) + prev_layer_output = AddSoftmaxLayer(config_lines, final_node_prefix, prev_layer_output) + elif add_final_sigmoid: + # Useful when you need the final outputs to be probabilities + # between 0 and 1. + # Usually used with an objective-type such as "quadratic" + prev_layer_output = AddSigmoidLayer(config_lines, final_node_prefix, prev_layer_output) + # we use the same name_affix as a prefix in for affine/scale nodes but as a + # suffix for output node + AddOutputLayer(config_lines, prev_layer_output, label_delay, suffix = name_affix, objective_type = objective_type) + +def AddLstmLayer(config_lines, + name, input, cell_dim, + recurrent_projection_dim = 0, + non_recurrent_projection_dim = 0, + clipping_threshold = 1.0, + norm_based_clipping = "false", + ng_per_element_scale_options = "", + ng_affine_options = "", + lstm_delay = -1, + self_repair_scale = None): + assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + input_descriptor = input['descriptor'] + input_dim = input['dimension'] + name = name.strip() + + if (recurrent_projection_dim == 0): + add_recurrent_projection = False + recurrent_projection_dim = cell_dim + recurrent_connection = "m_t" + else: + add_recurrent_projection = True + recurrent_connection = "r_t" + if (non_recurrent_projection_dim == 0): + add_non_recurrent_projection = False + else: + add_non_recurrent_projection = True + + self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' + # Natural gradient per element scale parameters + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + # Parameter Definitions W*(* replaced by - to have valid names) + components.append("# Input gate control : W_i* matrices") + components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("# note : the cell outputs pass through a diagonal matrix") + components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + + components.append("# Forget gate control : W_f* matrices") + components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("# note : the cell outputs pass through a diagonal matrix") + components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + + components.append("# Output gate control : W_o* matrices") + components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("# note : the cell outputs pass through a diagonal matrix") + 
components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + + components.append("# Cell input matrices : W_c* matrices") + components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + + + components.append("# Defining the non-linearities") + components.append("component name={0}_i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_g type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_h type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + + components.append("# Defining the cell computations") + components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + components.append("component name={0}_c type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, cell_dim, clipping_threshold, norm_based_clipping)) + + # c1_t and c2_t defined below + component_nodes.append("component-node name={0}_c_t component={0}_c input=Sum({0}_c1_t, {0}_c2_t)".format(name)) + c_tminus1_descriptor = "IfDefined(Offset({0}_c_t, {1}))".format(name, lstm_delay) + + component_nodes.append("# i_t") + component_nodes.append("component-node name={0}_i1 component={0}_W_i-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + component_nodes.append("component-node name={0}_i2 component={0}_w_ic input={1}".format(name, c_tminus1_descriptor)) + component_nodes.append("component-node name={0}_i_t component={0}_i input=Sum({0}_i1, {0}_i2)".format(name)) + + component_nodes.append("# f_t") + component_nodes.append("component-node name={0}_f1 component={0}_W_f-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + component_nodes.append("component-node name={0}_f2 component={0}_w_fc input={1}".format(name, c_tminus1_descriptor)) + component_nodes.append("component-node name={0}_f_t component={0}_f input=Sum({0}_f1,{0}_f2)".format(name)) + + component_nodes.append("# o_t") + component_nodes.append("component-node name={0}_o1 component={0}_W_o-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + component_nodes.append("component-node name={0}_o2 component={0}_w_oc input={0}_c_t".format(name)) + component_nodes.append("component-node name={0}_o_t component={0}_o input=Sum({0}_o1, {0}_o2)".format(name)) + + component_nodes.append("# h_t") + component_nodes.append("component-node name={0}_h_t component={0}_h input={0}_c_t".format(name)) + + component_nodes.append("# g_t") + component_nodes.append("component-node 
name={0}_g1 component={0}_W_c-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + component_nodes.append("component-node name={0}_g_t component={0}_g input={0}_g1".format(name)) + + component_nodes.append("# parts of c_t") + component_nodes.append("component-node name={0}_c1_t component={0}_c1 input=Append({0}_f_t, {1})".format(name, c_tminus1_descriptor)) + component_nodes.append("component-node name={0}_c2_t component={0}_c2 input=Append({0}_i_t, {0}_g_t)".format(name)) + + component_nodes.append("# m_t") + component_nodes.append("component-node name={0}_m_t component={0}_m input=Append({0}_o_t, {0}_h_t)".format(name)) + + # add the recurrent connections + if (add_recurrent_projection and add_non_recurrent_projection): + components.append("# projection matrices : Wrm and Wpm") + components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options)) + components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping)) + component_nodes.append("# r_t and p_t") + component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) + component_nodes.append("dim-range-node name={0}_r_t_preclip input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) + component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) + output_descriptor = '{0}_rp_t'.format(name) + output_dim = recurrent_projection_dim + non_recurrent_projection_dim + + elif add_recurrent_projection: + components.append("# projection matrices : Wrm") + components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options)) + components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping)) + component_nodes.append("# r_t") + component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) + component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) + output_descriptor = '{0}_r_t'.format(name) + output_dim = recurrent_projection_dim + + else: + components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, cell_dim, clipping_threshold, norm_based_clipping)) + component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_m_t".format(name)) + output_descriptor = '{0}_r_t'.format(name) + output_dim = cell_dim + + return { + 'descriptor': output_descriptor, + 'dimension':output_dim + } diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 880ddd11f48..151dd6be2e7 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -12,8 +12,11 @@ stage=1 transform_dir= # dir to find fMLLR transforms. nj=4 # number of decoding jobs. If --transform-dir set, must match that number! acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. 
+post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. cmd=run.pl beam=15.0 +frames_per_chunk=50 max_active=7000 min_active=200 ivector_scale=1.0 @@ -23,6 +26,10 @@ num_threads=1 # if >1, will use gmm-latgen-faster-parallel parallel_opts= # ignored now. scoring_opts= skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 feat_type= online_ivector_dir= minimize=false @@ -69,7 +76,7 @@ done sdata=$data/split$nj; cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -95,7 +102,7 @@ if [ ! -z "$transform_dir" ]; then [ ! -s $transform_dir/num_jobs ] && \ echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; nj_orig=$(cat $transform_dir/num_jobs) - + if [ $feat_type == "raw" ]; then trans=raw_trans; else trans=trans; fi if [ $feat_type == "lda" ] && \ @@ -125,19 +132,36 @@ fi if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector_period=$ivector_period" + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" fi if [ $stage -le 1 ]; then $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ - nnet3-latgen-faster$thread_string $ivector_opts \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ --word-symbol-table=$graphdir/words.txt "$model" \ - $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; fi -# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# The output of this script is the files "lat.*.gz"-- we'll rescore this at # different acoustic scales to get the final output. @@ -146,7 +170,8 @@ if [ $stage -le 2 ]; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; echo "score best paths" - local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $iter_opt $scoring_opts --cmd "$cmd" $data $graphdir $dir echo "score confidence and timing with sclite" fi fi diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 88cf54e824e..2290c4d2e7f 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -34,6 +34,11 @@ 'shape':'box', 'style':'filled' }, + 'ConvolutionComponent':{ + 'color':'lightpink', + 'shape':'box', + 'style':'filled' + }, 'FixedScaleComponent':{ 'color':'blueviolet', 'shape':'box', @@ -64,6 +69,11 @@ 'shape':'rectangle', 'style':'filled' }, + 'ClipGradientComponent':{ + 'color':'bisque', + 'shape':'rectangle', + 'style':'filled' + }, 'ElementwiseProductComponent':{ 'color':'green', 'shape':'rectangle', @@ -84,10 +94,10 @@ def GetDotNodeName(name_string, is_component = False): # 2. Nnet3 names can be shared among components and component nodes # dot does not allow common names # - name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("-", "hyphen", name_string) if is_component: - name_string += name_string.strip() + "_component" - return name_string + node_name_string += node_name_string.strip() + "_component" + return {"label":name_string, "node":node_name_string} def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -96,18 +106,18 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = "{{"+label+"}|Append}" - dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -116,7 +126,7 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) if attr_string != '': dot_string += ' [{0}] '.format(attr_string) @@ -125,6 +135,28 @@ def ProcessAppendDescriptor(segment, 
parent_node_name, affix, edge_attributes = return dot_graph +def ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes = None): + dot_graph = [] + + label = 'Round ({0})'.format(segment['arguments'][1]) + style = None + if edge_attributes is not None: + if edge_attributes.has_key('label'): + label = "{0} {1}".format(edge_attributes['label'], label) + if edge_attributes.has_key('style'): + style = 'style={0}'.format(edge_attributes['style']) + + attr_string = 'label="{0}"'.format(label) + if style is not None: + attr_string += ' {0}'.format(style) + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], + attr_string)) + if segment['sub_segments']: + raise Exception("Round can just deal with forwarding descriptor, no sub-segments allowed") + return dot_graph + + def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -140,8 +172,8 @@ def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("Offset can just deal with forwarding descriptor, no sub-segments allowed") @@ -151,21 +183,23 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non dot_graph = [] names = [] desc_name = 'Sum_{0}'.format(affix) + # create the sum node for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) - dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) + dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name+"_"+str(i)) + # link the sum node parts to corresponding segments part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = '{{'+label+'}|Sum}' - dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -174,7 +208,7 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + 
dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) dot_string += ' [{0} tailport=s ] '.format(attr_string) dot_graph.append(dot_string) @@ -195,8 +229,8 @@ def ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attribu if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("ReplaceIndex can just deal with forwarding descriptor, no sub-segments allowed") @@ -215,7 +249,7 @@ def ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes dot_graph += DescriptorSegmentToDot(sub_segment, parent_node_name, parent_node_name, edge_attributes={'style':'dotted', 'label':'IfDefined'}) if segment['arguments']: - dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0]), GetDotNodeName(parent_node_name))) + dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_graph @@ -232,6 +266,8 @@ def DescriptorSegmentToDot(segment, parent_node_name, affix, edge_attributes = N dot_graph += ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes) elif segment['name'] == "ReplaceIndex": dot_graph += ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attributes) + elif segment['name'] == "Round": + dot_graph += ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes) else: raise Exception('Descriptor {0}, is not recognized by this script. 
Please add Process{0}Descriptor method'.format(segment['name'])) return dot_graph @@ -244,7 +280,7 @@ def Nnet3DescriptorToDot(descriptor, parent_node_name): dot_lines += DescriptorSegmentToDot(segment, parent_node_name, parent_node_name) elif arguments: assert(len(arguments) == 1) - dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0]), GetDotNodeName(parent_node_name))) + dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_lines def ParseNnet3String(string): @@ -298,27 +334,28 @@ def Nnet3ComponentToDot(component_config, component_attributes = None): except KeyError: pass - return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True), label, attr_string)] + return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True)['node'], label, attr_string)] # input-node name=input dim=40 def Nnet3InputToDot(parsed_config): - return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['dim'] )] + return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['dim'] )] # output-node name=output input=Final_log_softmax dim=3940 objective=linear +#output-node name=output input=Offset(Final_log_softmax, 5) dim=3940 objective=linear def Nnet3OutputToDot(parsed_config): dot_graph = [] - dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['objective'])) - dot_graph.append('{0} -> {1}'.format(GetDotNodeName(parsed_config['input']), GetDotNodeName(parsed_config['name']))) + dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) + dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['objective'])) return dot_graph # dim-range-node name=Lstm1_r_t input-node=Lstm1_rp_t dim-offset=0 dim=256 def Nnet3DimrangeToDot(parsed_config): dot_graph = [] - dot_graph.append(parsed_config['name']) - dot_graph.append('{0} [shape=rectangle]'.format(GetDotNodeName(parsed_config['name']))) - dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node']), - GetDotNodeName(parsed_config['name']), + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [shape=rectangle, label="{1}"]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node'])['node'], + GetDotNodeName(parsed_config['name'])['node'], parsed_config['dim-offset'], parsed_config['dim'])) return dot_graph @@ -326,9 +363,10 @@ def Nnet3DimrangeToDot(parsed_config): def Nnet3ComponentNodeToDot(parsed_config): dot_graph = [] dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) - dot_graph.append('{0} [ label="{1}", shape=box ]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'])) - dot_graph.append('{0} -> {1} [ weight=10 ]'.format(GetDotNodeName(parsed_config['component'], is_component = True), - GetDotNodeName(parsed_config['name']))) + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [ label="{1}", shape=box ]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [ weight=10 
]'.format(GetDotNodeName(parsed_config['component'], is_component = True)['node'], + GetDotNodeName(parsed_config['name'])['node'])) return dot_graph def GroupConfigs(configs, node_prefixes = []): @@ -408,6 +446,8 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): " will be clustered together in the dot-graph" " --node-prefixes Lstm1,Lstm2,Layer1", default=None) + parser.add_argument("dotfile", help="name of the dot output file") + print(' '.join(sys.argv), file=sys.stderr) args = parser.parse_args() @@ -420,4 +460,7 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): lines = sys.stdin.readlines() dot_graph = ParseConfigLines(lines, component_attributes = component_attributes, node_prefixes = node_prefixes) - print("\n".join(dot_graph)) + + dotfile_handle = open(args.dotfile, "w") + dotfile_handle.write("\n".join(dot_graph)) + dotfile_handle.close() diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 28dc237153e..364f6a72443 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -19,14 +19,19 @@ feat_type=raw # set it to 'lda' to use LDA features. frames_per_eg=8 # number of frames of labels per example. more->less disk space and # less time preparing egs, but more I/O during training. # note: the script may reduce this if reduce_frames_per_eg is true. -left_context=4 # amount of left-context per eg -right_context=4 # amount of right-context per eg +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg # if there is only one archive and even with the - # reduced frames_pe_eg, the number of + # reduced frames_per_eg, the number of # samples_per_iter that would result is less than or # equal to the user-specified value. num_utts_subset=300 # number of utterances in validation and training @@ -34,15 +39,18 @@ num_utts_subset=300 # number of utterances in validation and training num_valid_frames_combine=0 # #valid frames for combination weights at the very end. num_train_frames_combine=10000 # # train frames for the above. num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs -samples_per_iter=400000 # each iteration of training, see this many samples - # per job. This is just a guideline; it will pick a number - # that divides the number of samples in the entire data. +samples_per_iter=400000 # this is the target number of egs in each archive of egs + # (prior to merging egs). We probably should have called + # it egs_per_iter. This is just a guideline; it will pick + # a number that divides the number of samples in the + # entire data. transform_dir= # If supplied, overrides alidir as the place to find fMLLR transforms -post_dir= # If supplied, we will use posteriors in it as soft training targets. stage=0 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. 
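A note on the nnet3 graph-plotting changes earlier in this patch: the call sites now consistently index the return value of GetDotNodeName with ['node'] (and, where a label is printed, ['label']), so the helper evidently returns a small dict rather than a bare string. The function body itself is not part of this diff; the sketch below is only an assumed illustration of the contract those call sites rely on, and the exact name-sanitization rule is hypothetical.

# Assumed sketch of what GetDotNodeName returns (illustrative, not the real code).
import re

def GetDotNodeName(nnet3_name, is_component=False):
    # dot identifiers cannot contain characters like '-' or '.', so map them
    # away, but keep the original string for use as a human-readable label
    node = re.sub(r'[^A-Za-z0-9_]', '_', nnet3_name)
    if is_component:
        node += '_component'   # keep component names distinct from node names
    return {'node': node, 'label': nnet3_name}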
+nj=6 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. online_ivector_dir= # can be used if we are including speaker information as iVectors. cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the @@ -53,16 +61,17 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; - if [ $# != 3 ]; then echo "Usage: $0 [opts] " echo " e.g.: $0 data/train exp/tri3_ali exp/tri4_nnet/egs" echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options" + echo " --nj # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." - echo " --samples-per-iter <#samples;400000> # Number of samples of data to process per iteration, per" - echo " # process." + echo " --samples-per-iter <#samples;400000> # Target number of egs per archive (option is badly named)" echo " --feat-type # (raw is the default). The feature type you want" echo " # to use as input to the neural net." echo " --frames-per-eg # number of frames per eg on disk" @@ -73,7 +82,7 @@ if [ $# != 3 ]; then echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -89,19 +98,19 @@ for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done -nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... - sdata=$data/split$nj utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info cp $alidir/tree $dir -# Get list of validation utterances. +num_ali_jobs=$(cat $alidir/num_jobs) || exit 1; + +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; -if [ -f $data/utt2uniq ]; then +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" echo "include all perturbed versions of the same 'real' utterances." mv $dir/valid_uttlist $dir/valid_uttlist.tmp @@ -117,7 +126,26 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir -## Set up features. +# because we'll need the features with a different number of jobs than $alidir, +# copy to ark,scp. +if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + + + +## Set up features. 
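The utt2uniq handling just above ensures that when data augmentation has produced several perturbed copies of one original utterance, either all of them or none of them end up in the validation list, so a perturbed twin of a validation utterance cannot leak into training. A rough Python equivalent of that awk/apply_map pipeline (function and variable names here are illustrative):

# Rough stand-in for the valid_uttlist augmentation step (illustrative only).
def augment_valid_list(valid_utts, utt2uniq):
    """valid_utts: iterable of utt ids; utt2uniq: dict utt -> 'real' utt id."""
    uniq2utt = {}
    for utt, uniq in utt2uniq.items():
        uniq2utt.setdefault(uniq, []).append(utt)
    expanded = set()
    for utt in valid_utts:
        # pull in every perturbed version of the same underlying recording
        expanded.update(uniq2utt.get(utt2uniq.get(utt, utt), [utt]))
    return sorted(expanded)

# e.g. utt2uniq = {'sp0.9-u1': 'u1', 'u1': 'u1', 'sp1.1-u1': 'u1'}
# augment_valid_list(['u1'], utt2uniq) -> ['sp0.9-u1', 'sp1.1-u1', 'u1']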
echo "$0: feature type is $feat_type" case $feat_type in @@ -126,7 +154,7 @@ case $feat_type in train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` # caution: the top-level nnet training script should copy these to its own dir now. cp $alidir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1; @@ -140,32 +168,25 @@ case $feat_type in *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; esac -if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then - echo "$0: using transforms from $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |" -fi -if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then - echo "$0: using raw-fMLLR transforms from $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |" +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" else echo 0 >$dir/info/ivector_dim fi -if [ $stage -le 0 ]; then +if [ $stage -le 1 ]; then echo "$0: working out number of frames of training data" num_frames=$(steps/nnet2/get_num_frames.sh $data) echo $num_frames > $dir/info/num_frames @@ -192,10 +213,22 @@ while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \ done $reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small." 
+# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple+1]; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] + echo $num_archives >$dir/info/num_archives echo $frames_per_eg >$dir/info/frames_per_eg - -# Working out number of egs per archive +# Work out the number of egs per archive egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] ! [ $egs_per_archive -le $samples_per_iter ] && \ echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \ @@ -206,38 +239,48 @@ echo $egs_per_archive > $dir/info/egs_per_archive echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" -# Making soft links to storage directories. This is a no-up unless -# the subdirectory $dir/storage/ exists. See utils/create_split_dir.pl -for x in `seq $num_archives`; do - utils/create_data_link.pl $dir/egs.$x.ark - for y in `seq $nj`; do - utils/create_data_link.pl $dir/egs_orig.$x.$y.ark + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/egs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/egs_orig.$y.$x.ark; done) done -done +fi + +if [ $stage -le 2 ]; then + echo "$0: copying data alignments" + for id in $(seq $num_ali_jobs); do gunzip -c $alidir/ali.$id.gz; done | \ + copy-int-vector ark:- ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; +fi egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context num_pdfs=$(tree-info --print-args=false $alidir/tree | grep num-pdfs | awk '{print $2}') -if [ $stage -le 2 ]; then +if [ $stage -le 3 ]; then echo "$0: Getting validation and training subset examples." rm $dir/.error 2>/dev/null echo "$0: ... extracting validation and training-subset alignments." - set -o pipefail; - for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \ - copy-int-vector ark:- ark,t:- | \ - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \ - gzip -c >$dir/ali_special.gz || exit 1; - set +o pipefail; # unset the pipefail option. 
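The intermediate-archive block above exists because each egs-dumping job keeps one output file open per archive; if num_archives plus a few ancillary handles exceeds the per-process limit reported by `ulimit -n`, the script first writes a smaller number of intermediate archives and later splits each into archives_multiple final ones. The same sizing logic in Python form (the "+4" spare handles mirror the script; the example numbers are made up):

# Sketch of the intermediate-archive sizing used above.
def plan_archives(num_archives, max_open_filehandles):
    num_archives_intermediate = num_archives
    archives_multiple = 1
    # keep a few handles spare, as the script's "$num_archives_intermediate+4" does
    while num_archives_intermediate + 4 > max_open_filehandles:
        archives_multiple += 1
        num_archives_intermediate = num_archives // archives_multiple + 1
    # round num_archives up to an exact multiple so the later split comes out even
    num_archives = archives_multiple * num_archives_intermediate
    return num_archives, num_archives_intermediate, archives_multiple

# e.g. 2500 requested archives under a 1024-handle limit:
# plan_archives(2500, 1024) -> (2502, 834, 3)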
+ + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/ali.scp >$dir/ali_special.scp $cmd $dir/log/create_valid_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $egs_opts "$valid_feats" \ - "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/valid_all.egs" || touch $dir/.error & + nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $valid_egs_opts "$valid_feats" \ + "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ + "ark:$dir/valid_all.egs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $egs_opts "$train_subset_feats" \ - "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \ + nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ + "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ "ark:$dir/train_subset_all.egs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 @@ -262,55 +305,76 @@ if [ $stage -le 2 ]; then for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do [ ! -s $f ] && echo "No examples in file $f" && exit 1; done - rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs $dir/ali_special.gz + rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs fi -if [ $stage -le 3 ]; then - # create egs_orig.*.*.ark; the first index goes to $num_archives, - # the second to $nj (which is the number of jobs in the original alignment - # dir) +if [ $stage -le 4 ]; then + # create egs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. egs_list= - for n in $(seq $num_archives); do - egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark" + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark" done echo "$0: Generating training examples on disk" - # The examples will go round-robin to egs_list. - if [ ! -z $post_dir ]; then - $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ - scp:$post_dir/post.JOB.scp ark:- \| \ - nnet3-copy-egs ark:- $egs_list || exit 1; - else - $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ - "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ - nnet3-copy-egs ark:- $egs_list || exit 1; - fi + # The examples will go round-robin to egs_list. 
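The round-robin comment here is kept from the old script; with the new `nnet3-copy-egs --random=true --srand=JOB` invocation the destination archive is chosen pseudo-randomly rather than strictly cyclically, but the effect is the same: each job's egs are spread over $num_archives_intermediate per-job files egs_orig.JOB.n.ark, which are later concatenated across jobs and shuffled. A pure-Python stand-in for the fan-out, plus the index mapping used a little further down when archives_multiple > 1 (this is an illustration, not the actual binary):

# Stand-in for the fan-out done by nnet3-copy-egs (shown in the simple cyclic form).
def fan_out(examples, num_outputs):
    buckets = [[] for _ in range(num_outputs)]
    for i, eg in enumerate(examples):
        buckets[i % num_outputs].append(eg)   # --random=true would pick at random
    return buckets

def final_archive_index(x, y, archives_multiple):
    # mirrors archive_index=$[($x-1)*$archives_multiple+$y], used for the
    # egs.$x.$y.ark -> egs.$archive_index.ark symlinks when archives_multiple > 1
    return (x - 1) * archives_multiple + y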
+ $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ + "ark,s,cs:filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ + nnet3-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; fi -if [ $stage -le 4 ]; then + +if [ $stage -le 5 ]; then echo "$0: recombining and shuffling order of archives on disk" - # combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and # shuffle the order, writing to the egs.JOB.ark + # the input is a concatenation over the input jobs. egs_list= - for n in $(seq $nj); do - egs_list="$egs_list $dir/egs_orig.JOB.$n.ark" + for n in $(seq $nj); do + egs_list="$egs_list $dir/egs_orig.$n.JOB.ark" done - $cmd $io_opts $extra_opts JOB=1:$num_archives $dir/log/shuffle.JOB.log \ - nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark + ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:- \| \ + nnet3-copy-egs ark:- $output_archives || exit 1; + fi + fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "$0: removing temporary archives" - for x in `seq $num_archives`; do - for y in `seq $nj`; do + for x in $(seq $nj); do + for y in $(seq $num_archives_intermediate); do file=$dir/egs_orig.$x.$y.ark [ -L $file ] && rm $(readlink -f $file) rm $file done done + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/egs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null fi echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh new file mode 100755 index 00000000000..c72bbc073ab --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -0,0 +1,457 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# Copyright 2014-2015 Vimal Manohar + +# This script dumps examples for MPE, MMI or state-level minimum Bayes risk (sMBR) +# training of neural nets. +# Criteria supported are mpe, smbr and mmi + +# Begin configuration section. 
+cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. +frames_per_eg=150 # number of frames of labels per example. more->less disk space and + # less time preparing egs, but more I/O during training. + # note: the script may reduce this if reduce_frames_per_eg is true. +frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +frame_subsampling_factor=1 # ratio between input and output frame-rate of nnet. + # this should be read from the nnet. For now, it is taken as an option +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs +adjust_priors=true +priors_left_context= # amount of left_context for priors egs +priors_right_context= # amount of right_context for priors egs +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). +num_utts_subset=80 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. + +frames_per_iter=400000 # each iteration of training, see this many frames + # per job. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +determinize=true +minimize=true +remove_output_symbols=true +remove_epsilons=true +collapse_transition_ids=true +acwt=0.1 + +stage=0 +max_jobs_run=15 +max_shuffle_jobs_run=15 + +transform_dir= # If this is a SAT system, directory for transforms +online_ivector_dir= +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. + +num_priors_subset=100 +num_archives_priors=10 + +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 6 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe/degs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs (probably would be good to add -tc 5 or so if using" + echo " # GridEngine (to avoid excessive NFS traffic)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + echo " --online-ivector-dir # Directory for online-estimated iVectors, used in the" + echo " # online-neural-net setup." + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +denlatdir=$4 +src_model=$5 +dir=$6 + +extra_files= +[ ! -z $online_ivector_dir ] && \ + extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp" + +# Check some files. 
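The frames_per_eg=150 / frames_overlap_per_eg=30 defaults in the configuration section above mean each utterance is cut into chunks of roughly 150 supervised frames whose supervised regions overlap by about 30 frames, so frames near chunk boundaries (whose derivatives may be truncated by --left-deriv-truncate / --right-deriv-truncate) are still covered by a neighbouring chunk. The actual splitting happens inside nnet3-discriminative-get-egs; the snippet below is only back-of-envelope arithmetic under those assumptions.

# Back-of-envelope chunk count for one utterance (illustrative only).
import math

def approx_num_chunks(utt_frames, frames_per_eg=150, overlap=30):
    stride = frames_per_eg - overlap          # new frames contributed per chunk
    if utt_frames <= frames_per_eg:
        return 1
    return 1 + math.ceil((utt_frames - frames_per_eg) / stride)

# e.g. a 1000-frame utterance -> 1 + ceil(850/120) = 9 chunks of ~150 frames,
# each sharing roughly 30 supervised frames with its neighbour.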
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \ + $denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +mkdir -p $dir/log $dir/info || exit 1; + +[ "$(readlink /bin/sh)" == dash ] && \ + echo "This script won't work if /bin/sh points to dash. make it point to bash." && exit 1 + +nj=$(cat $denlatdir/num_jobs) || exit 1; + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +# Get list of validation utterances. +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ + > $dir/valid_uttlist || exit 1; + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + +if [ $stage -le 1 ]; then + nj_ali=$(cat $alidir/num_jobs) + all_ids=$(seq -s, $nj_ali) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; +fi + +prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |" + +if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +cp $alidir/tree $dir +cp $lang/phones/silence.csl $dir/info/ +cp $src_model $dir/final.mdl || exit 1 + +# Get list of utterances for prior computation. +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_priors_subset \ + > $dir/priors_uttlist || exit 1; + +## We don't support deltas here, only LDA or raw (mainly because deltas are less +## frequently used). +if [ -z $feat_type ]; then + if [ -f $alidir/final.mat ] && [ ! 
-f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts > $dir/cmvn_opts + ;; + lda) + splice_opts=`cat $alidir/splice_opts 2>/dev/null` + cp $alidir/splice_opts $dir 2>/dev/null + cp $alidir/final.mat $dir + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." && exit 1; + cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` + cp $alidir/cmvn_opts $dir 2>/dev/null + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" + priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" +fi + +if [ ! 
-z $online_ivector_dir ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim >$dir/info/ivector_dim + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + priors_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +fi + +if [ $stage -le 2 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# Working out total number of archives. Add one on the assumption the +# num-frames won't divide exactly, and we want to round up. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
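The $dir/storage convention referenced here (see utils/create_split_dir.pl) lets the many large .ark archives live on several filesystems: when $dir/storage exists, each archive is pre-created in one of the storage subdirectories and a symlink with the canonical name is left in $dir. The snippet below is only a schematic of that idea, not the actual utils/create_data_link.pl utility, and the round-robin placement rule is an assumption.

# Schematic of spreading archive files over storage directories via symlinks.
import os

def link_archives(dir_, names):
    storage = sorted(os.listdir(os.path.join(dir_, "storage")))   # e.g. ['1', '2', ...]
    for i, name in enumerate(names):
        target = os.path.join("storage", storage[i % len(storage)], name)
        open(os.path.join(dir_, target), "a").close()   # create the real file
        link = os.path.join(dir_, name)
        if not os.path.lexists(link):
            os.symlink(target, link)                    # canonical name points at it

# e.g. link_archives(dir, ["degs.%d.ark" % x for x in range(1, num_archives + 1)])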
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/degs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig.$y.$x.ark; done) + done +fi + +if [ $stage -le 3 ]; then + echo "$0: copying training lattices" + + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --write-compact=false --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1; + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp +fi + +splitter_opts="--supervision-splitter.determinize=$determinize --supervision-splitter.minimize=$minimize --supervision-splitter.remove_output_symbols=$remove_output_symbols --supervision-splitter.remove_epsilons=$remove_epsilons --supervision-splitter.collapse-transition-ids=$collapse_transition_ids --supervision-splitter.acoustic-scale=$acwt" + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; + +[ -z $priors_left_context ] && priors_left_context=$left_context; +[ -z $priors_right_context ] && priors_right_context=$right_context; + +left_context=$[left_context+frame_subsampling_factor/2] +right_context=$[right_context+frame_subsampling_factor/2] + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts" + +valid_left_context=$[valid_left_context+frame_subsampling_factor/2] +valid_right_context=$[valid_right_context+frame_subsampling_factor/2] + +# don't do the overlap thing for the validation data. +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts" + +priors_left_context=$[priors_left_context+frame_subsampling_factor/2] +priors_right_context=$[priors_right_context+frame_subsampling_factor/2] + +# don't do the overlap thing for the priors computation data. +priors_egs_opts="--left-context=$priors_left_context --right-context=$priors_right_context --num-frames=1 --compress=$compress" + +supervision_all_opts="--frame-subsampling-factor=$frame_subsampling_factor" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +echo $priors_left_context > $dir/info/priors_left_context +echo $priors_right_context > $dir/info/priors_right_context + +echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor + +( + if $adjust_priors && [ $stage -le 10 ]; then + if [ ! -f $dir/ali.scp ]; then + nj_ali=$(cat $alidir/num_jobs) + all_ids=$(seq -s, $nj_ali) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; + fi + + priors_egs_list= + for y in `seq $num_archives_priors`; do + utils/create_data_link.pl $dir/priors_egs.$y.ark + priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark" + done + + echo "$0: dumping egs for prior adjustment in the background." 
+ + num_pdfs=`am-info $alidir/final.mdl | grep pdfs | awk '{print $NF}' 2>/dev/null` || exit 1 + + $cmd $dir/log/create_priors_subset.log \ + nnet3-get-egs --num-pdfs=$num_pdfs $priors_ivector_opt $priors_egs_opts "$priors_feats" \ + "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \ + ark:- \| nnet3-copy-egs ark:- $priors_egs_list || \ + { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.log"; exit 1; } + + sleep 3; + + echo $num_archives_priors >$dir/info/num_archives_priors + else + echo 0 > $dir/info/num_archives_priors + fi +) & + +if [ $stage -le 4 ]; then + echo "$0: Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + echo "$0: ... extracting validation and training-subset alignments." + + #utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + # <$dir/lat.scp >$dir/lat_special.scp + + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/ali.scp >$dir/ali_special.scp + + $cmd $dir/log/create_valid_subset.log \ + discriminative-get-supervision $supervision_all_opts \ + scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ + nnet3-discriminative-get-egs $valid_ivector_opt $valid_egs_opts \ + $dir/final.mdl "$valid_feats" ark,s,cs:- "ark:$dir/valid_diagnostic.degs" || touch $dir/.error & + + $cmd $dir/log/create_train_subset.log \ + discriminative-get-supervision $supervision_all_opts \ + scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ + nnet3-discriminative-get-egs $train_subset_ivector_opt $egs_opts \ + $dir/final.mdl "$train_subset_feats" ark,s,cs:- "ark:$dir/train_diagnostic.degs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + + for f in $dir/{train_diagnostic,valid_diagnostic}.degs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done +fi + +if [ $stage -le 5 ]; then + # create degs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + degs_list= + for n in $(seq $num_archives_intermediate); do + degs_list="$degs_list ark:$dir/degs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to degs_list. + # To make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + discriminative-get-supervision $supervision_all_opts \ + "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \ + "ark:gunzip -c $denlatdir/lat.JOB.gz |" ark:- \| \ + nnet3-discriminative-get-egs $ivector_opt $egs_opts \ + $dir/final.mdl "$feats" ark,s,cs:- ark:- \| \ + nnet3-discriminative-copy-egs --random=true --srand=JOB ark:- $degs_list || exit 1; +fi + +if [ $stage -le 6 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "degs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the degs.JOB.ark + + # the input is a concatenation over the input jobs. + degs_list= + for n in $(seq $nj); do + degs_list="$degs_list $dir/degs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. 
+ $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:$dir/degs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/degs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # degs.intermediate_archive.{1,2,...}.ark will point to degs.archive.ark + ln -sf degs.$archive_index.ark $dir/degs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:- \| \ + nnet3-discriminative-copy-egs ark:- $output_archives || exit 1; + fi +fi + +if [ $stage -le 7 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'degs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + ) + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/degs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary lattices" + rm $dir/lat.* + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null +fi + +wait + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh new file mode 100755 index 00000000000..7fbc24858b5 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2015-2016 Vimal Manohar +# Apache 2.0. + +# This script is similar to steps/nnet3/get_egs.sh but used +# when getting general targets (not from alignment directory) for raw nnet +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the neural net (and also +# the validation examples used for diagnostics), and puts them in separate archives. +# +# This script dumps egs with several frames of labels, controlled by the +# frames_per_eg config variable (default: 8). This takes many times less disk +# space because typically we have 4 to 7 frames of context on the left and +# right, and this ends up getting shared. This is at the expense of slightly +# higher disk I/O while training. + + +# Begin configuration section. +cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. +target_type=sparse # dense to have dense targets, + # sparse to have posteriors targets +num_targets= # required for target-type=sparse with raw nnet +frames_per_eg=8 # number of frames of labels per example. more->less disk space and + # less time preparing egs, but more I/O during training. + # note: the script may reduce this if reduce_frames_per_eg is true. +left_context=4 # amount of left-context per eg (i.e. 
extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + +reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg + # if there is only one archive and even with the + # reduced frames_per_eg, the number of + # samples_per_iter that would result is less than or + # equal to the user-specified value. +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. +num_valid_frames_combine=0 # #valid frames for combination weights at the very end. +num_train_frames_combine=10000 # # train frames for the above. +num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs +samples_per_iter=400000 # this is the target number of egs in each archive of egs + # (prior to merging egs). We probably should have called + # it egs_per_iter. This is just a guideline; it will pick + # a number that divides the number of samples in the + # entire data. + +transform_dir= + +stage=0 +nj=6 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. +online_ivector_dir= # can be used if we are including speaker information as iVectors. +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/train/snr_targets.scp exp/tri4_nnet/egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --nj # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --samples-per-iter <#samples;400000> # Target number of egs per archive (option is badly named)" + echo " --feat-type # (raw is the default). The feature type you want" + echo " # to use as input to the neural net." + echo " --frames-per-eg # number of frames per eg on disk" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics" + echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +targets_scp=$2 +dir=$3 + +# Check some files. +[ ! 
-z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $data/feats.scp $targets_scp $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + + +# Get list of validation utterances. +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset | sort \ + > $dir/valid_uttlist || exit 1; + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset | sort > $dir/train_subset_uttlist || exit 1; + +if [ ! -z "$transform_dir" ] && [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + + + +## Set up features. +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + ;; + lda) + splice_opts=`cat $transform_dir/splice_opts 2>/dev/null` + # caution: the top-level nnet training script should copy these to its own dir now. + cp $transform_dir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1; + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." 
&& exit 1; + cmvn_opts=$(cat $dir/cmvn_opts) + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +else + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s:JOB:1:g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1] +# (for small data)- while reduce_frames_per_eg == true and the number of +# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it +# by 1. +reduced=false +while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \ + [ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do + frames_per_eg=$[$frames_per_eg-1] + num_archives=1 + reduced=true +done +$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small." + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). 
+max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple+1]; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] +! [ $egs_per_archive -le $samples_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" + + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/egs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/egs_orig.$y.$x.ark; done) + done +fi + +egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +for n in `seq $nj`; do + utils/filter_scp.pl $sdata/$n/utt2spk $targets_scp > $dir/targets.$n.scp +done + +targets_scp_split=$dir/targets.JOB.scp + +if [ $target_type == "dense" ]; then + num_targets=$(feat-to-dim "scp:$targets_scp" - 2>/dev/null) || exit 1 +fi + +if [ -z "$num_targets" ]; then + echo "$0: num-targets is not set" + exit 1 +fi + +case $target_type in + "dense") + get_egs_program="nnet3-get-egs-dense-targets --num-targets=$num_targets" + + targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | copy-feats scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | copy-feats scp:- ark:- |" + train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | copy-feats scp:- ark:- |" + ;; + "sparse") + get_egs_program="nnet3-get-egs --num-pdfs=$num_targets" + targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | ali-to-post scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |" + train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | ali-to-post scp:- ark:- |" + ;; + default) + echo "$0: Unknown --target-type $target_type. Choices are dense and sparse" + exit 1 +esac + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." 
+ rm -f $dir/.error 2>/dev/null + $cmd $dir/log/create_valid_subset.log \ + $get_egs_program \ + $valid_ivector_opt $valid_egs_opts "$valid_feats" \ + "$valid_targets" \ + "ark:$dir/valid_all.egs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + $get_egs_program \ + $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ + "$train_subset_targets" \ + "ark:$dir/train_subset_all.egs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ + ark:$dir/valid_combine.egs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ + ark:$dir/valid_diagnostic.egs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ + ark:$dir/train_combine.egs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ + ark:$dir/train_diagnostic.egs || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm -f $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs +fi + +if [ $stage -le 4 ]; then + # create egs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + # The examples will go round-robin to egs_list. + $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + $get_egs_program \ + $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" "$targets" \ + ark:- \| \ + nnet3-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/egs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). 
+ output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)"
+ for x in $(seq $num_archives_intermediate); do
+ for y in $(seq $archives_multiple); do
+ archive_index=$[($x-1)*$archives_multiple+$y]
+ # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
+ ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1
+ done
+ done
+ $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
+ nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:- \| \
+ nnet3-copy-egs ark:- $output_archives || exit 1;
+ fi
+
+fi
+
+if [ $stage -le 6 ]; then
+ echo "$0: removing temporary archives"
+ for x in $(seq $nj); do
+ for y in $(seq $num_archives_intermediate); do
+ file=$dir/egs_orig.$x.$y.ark
+ [ -L $file ] && rm $(readlink -f $file)
+ rm $file
+ done
+ done
+ if [ $archives_multiple -gt 1 ]; then
+ # there are some extra soft links that we should delete.
+ for f in $dir/egs.*.*.ark; do rm $f; done
+ fi
+ echo "$0: removing temporary transforms and target scp files"
+ # Ignore errors below because trans.* might not exist.
+ rm -f $dir/trans.{ark,scp} $dir/targets.*.scp 2>/dev/null
+fi
+
+echo "$0: Finished preparing training examples"
+
diff --git a/egs/wsj/s5/steps/nnet3/get_successful_models.py b/egs/wsj/s5/steps/nnet3/get_successful_models.py
new file mode 100755
index 00000000000..3661d91b8d5
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/get_successful_models.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import re
+import os
+import argparse
+import sys
+import warnings
+import copy
+import glob
+
+
+if __name__ == "__main__":
+ # we add compulsory arguments as named arguments for readability
+ parser = argparse.ArgumentParser(description="Create a list of models suitable for averaging "
+ "based on their train objf values.",
+ epilog="See steps/nnet3/lstm/train.sh for example.")
+
+ parser.add_argument("--difference-threshold", type=float,
+ help="Threshold for discarding models: a model is discarded "
+ "when its objective function is more than this much worse "
+ "than that of the best model.",
+ default=1.0)
+
+ parser.add_argument("num_models", type=int,
+ help="Number of models.")
+
+ parser.add_argument("logfile_pattern", type=str,
+ help="Pattern for identifying the log-file names. "
+ "It specifies the entire log file name, except for the job number, "
+ "which is replaced with '%'. e.g. exp/nnet3/tdnn_sp/log/train.4.%.log")
+
+
+ args = parser.parse_args()
+
+ assert(args.num_models > 0)
+
+ parse_regex = re.compile("LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames")
+ loss = []
+ for i in range(args.num_models):
+ model_num = i + 1
+ logfile = re.sub('%', str(model_num), args.logfile_pattern)
+ lines = open(logfile, 'r').readlines()
+ this_loss = -100000
+ for line_num in range(1, len(lines) + 1):
+ # we search from the end of the log, as this needs
+ # fewer regex searches (Python regex matching is slow).
+ mat_obj = parse_regex.search(lines[-1*line_num]) + if mat_obj is not None: + this_loss = float(mat_obj.groups()[0]) + break; + loss.append(this_loss); + max_index = loss.index(max(loss)) + accepted_models = [] + for i in range(args.num_models): + if (loss[max_index] - loss[i]) <= args.difference_threshold: + accepted_models.append(i+1) + + model_list = " ".join(map(lambda x: str(x), accepted_models)) + print(model_list) + + if len(accepted_models) != args.num_models: + print("WARNING: Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), args.num_models, args.logfile_pattern), file=sys.stderr) + print(" Using models {0}".format(model_list), file=sys.stderr) diff --git a/egs/wsj/s5/steps/nnet3/lstm/decode.sh b/egs/wsj/s5/steps/nnet3/lstm/decode.sh new file mode 100755 index 00000000000..07195c071d3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/lstm/decode.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. If the neural net was built on +# top of fMLLR transforms from a conventional system, you should provide the +# --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +cmd=run.pl +beam=15.0 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # ignored now. +scoring_opts= +skip_scoring=false +feat_type= +online_ivector_dir= +minimize=false + +frames_per_chunk=10000 +extra_left_context=20 # it is recommended to use the same value as the chunk_left_context + # used during training +extra_right_context=0 # it is recommended to use the same value as the chunk_right_context + # used during training (usually used in bi-directional LSTM case) +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '--num-threads 4' if you supply --num-threads 4" + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + + +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh --per-utt $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi + echo "$0: feature type is $feat_type" +fi + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && \ + ! cmp $transform_dir/../final.mat $srcdir/final.mat && \ + ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then + echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector_period=$ivector_period" +fi + +if [ $stage -le 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts \ + --frames-per-chunk=$frames_per_chunk \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. + + +if [ $stage -le 2 ]; then + if ! $skip_scoring ; then + [ ! 
-x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 3c7c2e2c975..9c2c641b0e9 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -6,218 +6,160 @@ import sys import warnings import copy +import imp -# adds the input nodes and returns the descriptor -def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - output_dim = 0 - components.append('input-node name=input dim=' + str(feat_dim)) - list = [('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_indexes] - output_dim += len(splice_indexes) * feat_dim - if args.ivector_dim > 0: - components.append('input-node name=ivector dim=' + str(ivector_dim)) - list.append('ReplaceIndex(ivector, t, 0)') - output_dim += ivector_dim - splice_descriptor = "Append({0})".format(", ".join(list)) - print(splice_descriptor) - return {'descriptor': splice_descriptor, - 'dimension': output_dim} - -def AddLdaLayer(config_lines, name, input, lda_file): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - components.append('component name={0}_lda type=FixedAffineComponent matrix={1}'.format(name, lda_file)) - component_nodes.append('component-node name={0}_lda component={0}_lda input={1}'.format(name, input['descriptor'])) - - return {'descriptor': '{0}_lda'.format(name), - 'dimension': input['dimension']} - -def AddAffineLayer(config_lines, name, input, output_dim): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input['dimension'], output_dim)) - component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) - - return {'descriptor': '{0}_affine'.format(name), - 'dimension': output_dim} - -def AddAffRelNormLayer(config_lines, name, input, output_dim): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input['dimension'], output_dim)) - components.append("component name={0}_relu type=RectifiedLinearComponent dim={1}".format(name, output_dim)) - components.append("component name={0}_renorm type=NormalizeComponent dim={1}".format(name, output_dim)) - - component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) - component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) - component_nodes.append("component-node name={0}_renorm component={0}_renorm input={0}_relu".format(name)) - - return {'descriptor': '{0}_renorm'.format(name), - 'dimension': output_dim} - - - -def AddSoftmaxLayer(config_lines, name, input): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - components.append("component name={0}_log_softmax type=LogSoftmaxComponent 
dim={1}".format(name, input['dimension'])) - component_nodes.append("component-node name={0}_log_softmax component={0}_log_softmax input={1}".format(name, input['descriptor'])) - - return {'descriptor': '{0}_log_softmax'.format(name), - 'dimension': input['dimension']} - -def AddOutputNode(config_lines, input): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - component_nodes.append('output-node name=output input={0}'.format(input['descriptor'])) - -def AddFinalLayer(config_lines, input, output_dim): - prev_layer_output = AddAffineLayer(config_lines, "Final", input, output_dim) - prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) - AddOutputNode(config_lines, prev_layer_output) - -def AddLstmLayer(config_lines, - name, input, cell_dim, - recurrent_projection_dim = 0, - non_recurrent_projection_dim = 0): - assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - input_descriptor = input['descriptor'] - input_dim = input['dimension'] - name = name.strip() - - if (recurrent_projection_dim == 0): - add_recurrent_projection = False - recurrent_projection_dim = cell_dim - recurrent_connection = "m_t" - else: - add_recurrent_projection = True - recurrent_connection = "r_t" - if (non_recurrent_projection_dim == 0): - add_non_recurrent_projection = False - else: - add_non_recurrent_projection = True - - - # Parameter Definitions W*(* replaced by - to have valid names) - components.append("# Input gate control : W_i* matrices") - components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input_dim + recurrent_projection_dim, cell_dim)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1}".format(name, cell_dim)) - - components.append("# Forget gate control : W_f* matrices") - components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input_dim + recurrent_projection_dim, cell_dim)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1}".format(name, cell_dim)) - - components.append("# Output gate control : W_o* matrices") - components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input_dim + recurrent_projection_dim, cell_dim)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1}".format(name, cell_dim)) - - components.append("# Cell input matrices : W_c* matrices") - components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input_dim + recurrent_projection_dim, cell_dim)) - - if add_recurrent_projection and add_non_recurrent_projection: - components.append("# projection matrices : Wrm and Wpm") - components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim)) - - elif add_recurrent_projection : - components.append("# projection matrices : Wrm") - 
components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, cell_dim, recurrent_projection_dim)) - - components.append("# Defining the non-linearities") - components.append("component name={0}_i type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_f type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_o type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_g type=TanhComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_h type=TanhComponent dim={1}".format(name, cell_dim)) - - components.append("# Defining the cell computations") - components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - - # c1_t and c2_t defined below - c_tminus1_descriptor = "Sum(IfDefined(Offset({0}_c1_t, -1)), IfDefined(Offset( {0}_c2_t, -1)))".format(name) - - component_nodes.append("# i_t") - component_nodes.append("component-node name={0}_i1 component={0}_W_i-xr input=Append({1}, IfDefined(Offset({0}_{2}, -1)))".format(name, input_descriptor, recurrent_connection)) - component_nodes.append("component-node name={0}_i2 component={0}_w_ic input={1}".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_i_t component={0}_i input=Sum({0}_i1, {0}_i2)".format(name)) - - component_nodes.append("# f_t") - component_nodes.append("component-node name={0}_f1 component={0}_W_f-xr input=Append({1}, IfDefined(Offset({0}_{2}, -1)))".format(name, input_descriptor, recurrent_connection)) - component_nodes.append("component-node name={0}_f2 component={0}_w_fc input={1}".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_f_t component={0}_f input=Sum({0}_f1,{0}_f2)".format(name)) - - component_nodes.append("# o_t") - component_nodes.append("component-node name={0}_o1 component={0}_W_o-xr input=Append({1}, IfDefined(Offset({0}_{2}, -1)))".format(name, input_descriptor, recurrent_connection)) - component_nodes.append("component-node name={0}_o2 component={0}_w_oc input=Sum({0}_c1_t, {0}_c2_t)".format(name)) - component_nodes.append("component-node name={0}_o_t component={0}_o input=Sum({0}_o1, {0}_o2)".format(name)) - - component_nodes.append("# h_t") - component_nodes.append("component-node name={0}_h_t component={0}_h input=Sum({0}_c1_t, {0}_c2_t)".format(name)) - - component_nodes.append("# g_t") - component_nodes.append("component-node name={0}_g1 component={0}_W_c-xr input=Append({1}, IfDefined(Offset({0}_{2}, -1)))".format(name, input_descriptor, recurrent_connection)) - component_nodes.append("component-node name={0}_g_t component={0}_g input={0}_g1".format(name)) - - component_nodes.append("# parts of c_t") - component_nodes.append("component-node name={0}_c1_t component={0}_c1 input=Append({0}_f_t, {1})".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_c2_t component={0}_c2 input=Append({0}_i_t, {0}_g_t)".format(name)) - - component_nodes.append("# m_t") - component_nodes.append("component-node name={0}_m_t component={0}_m input=Append({0}_o_t, 
{0}_h_t)".format(name)) - - if (add_recurrent_projection and add_non_recurrent_projection): - component_nodes.append("# r_t and p_t") - component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) - component_nodes.append("dim-range-node name={0}_r_t input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) - output_descriptor = '{0}_rp_t'.format(name) - output_dim = recurrent_projection_dim + non_recurrent_projection_dim - - elif add_recurrent_projection: - component_nodes.append("# r_t") - component_nodes.append("component-node name={0}_r_t component={0}_Wrm input={0}_m_t".format(name)) - output_descriptor = '{0}_r_t'.format(name) - output_dim = recurrent_projection_dim +nodes = imp.load_source('nodes', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for LSTMs creation and training", + epilog="See steps/nnet3/lstm/train.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + + # General neural network options + parser.add_argument("--splice-indexes", type=str, + help="Splice indexes at input layer, e.g. 
'-3,-2,-1,0,1,2,3'", required = True, default="0") + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + + # LSTM options + parser.add_argument("--num-lstm-layers", type=int, + help="Number of LSTM layers to be stacked", default=1) + parser.add_argument("--cell-dim", type=int, + help="dimension of lstm-cell") + parser.add_argument("--recurrent-projection-dim", type=int, + help="dimension of recurrent projection") + parser.add_argument("--non-recurrent-projection-dim", type=int, + help="dimension of non-recurrent projection") + parser.add_argument("--hidden-dim", type=int, + help="dimension of fully-connected layers") + + # Natural gradient options + parser.add_argument("--ng-per-element-scale-options", type=str, + help="options to be supplied to NaturalGradientPerElementScaleComponent", default="") + parser.add_argument("--ng-affine-options", type=str, + help="options to be supplied to NaturalGradientAffineComponent", default="") + + # Gradient clipper options + parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, + help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) + parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) + parser.add_argument("--self-repair-scale", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + # Delay options + parser.add_argument("--label-delay", type=int, default=None, + help="option to delay the labels to make the lstm robust") + + parser.add_argument("--lstm-delay", type=str, default=None, + help="option to have different delays in recurrence for each lstm") + + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.num_lstm_layers < 1): + sys.exit("--num-lstm-layers has to be a positive integer") + if (args.clipping_threshold < 0): + sys.exit("--clipping-threshold has to be a non-negative") + if args.lstm_delay is None: + args.lstm_delay = [[-1]] * args.num_lstm_layers else: - output_descriptor = '{0}_m_t'.format(name) - output_dim = cell_dim + try: + args.lstm_delay = ParseLstmDelayString(args.lstm_delay.strip()) + except ValueError: + sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay)) + if len(args.lstm_delay) != args.num_lstm_layers: + sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers") - return { - 'descriptor': output_descriptor, - 'dimension':output_dim - } + return args def PrintConfig(file_name, config_lines): f = open(file_name, 'w') f.write("\n".join(config_lines['components'])+"\n") f.write("\n#Component nodes\n") - f.write("\n".join(config_lines['component-nodes'])) + f.write("\n".join(config_lines['component-nodes'])+"\n") f.close() -def ParseSpliceString(splice_indexes): +def ParseSpliceString(splice_indexes, label_delay=None): ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ] split1 = splice_indexes.split(" "); # we already checked the string is nonempty. if len(split1) < 1: splice_indexes = "0" - left_context = 0 - right_context = 0 + left_context=0 + right_context=0 + if label_delay is not None: + left_context = -label_delay + right_context = label_delay + splice_array = [] try: for i in range(len(split1)): - indexes = map(lambda x: int(x), split1[i].split(",")) + indexes = map(lambda x: int(x), split1[i].strip().split(",")) + print(indexes) if len(indexes) < 1: raise ValueError("invalid --splice-indexes argument, too-short element: " + splice_indexes) @@ -232,7 +174,7 @@ def ParseSpliceString(splice_indexes): right_context += indexes[-1] splice_array.append(indexes) except ValueError as e: - raise ValueError("invalid --splice-indexes argument " + splice_indexes + e) + raise ValueError("invalid --splice-indexes argument " + splice_indexes + str(e)) left_context = max(0, left_context) right_context = max(0, right_context) @@ -243,111 +185,146 @@ def ParseSpliceString(splice_indexes): 'num_hidden_layers':len(splice_array) } -if __name__ == "__main__": - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description="Writes config files and variables " - "for LSTMs creation and training", - epilog="See steps/nnet3/lstm/train.sh for example.") - parser.add_argument("--splice-indexes", type=str, - help="Splice indexes at input layer, e.g. 
'-3,-2,-1,0,1,2,3' [compulsary argument]", default="0") - parser.add_argument("--num-lstm-layers", type=int, - help="Number of LSTM layers to be stacked", default=1) - parser.add_argument("--feat-dim", type=int, - help="Raw feature dimension, e.g. 13") - parser.add_argument("--ivector-dim", type=int, - help="iVector dimension, e.g. 100", default=0) - parser.add_argument("--cell-dim", type=int, - help="dimension of lstm-cell") - parser.add_argument("--recurrent-projection-dim", type=int, - help="dimension of recurrent projection") - parser.add_argument("--non-recurrent-projection-dim", type=int, - help="dimension of non-recurrent projection") - parser.add_argument("--hidden-dim", type=int, - help="dimension of fully-connected layers") - parser.add_argument("--bptt-truncation-width", type=int, - help="number of time steps through which gradient is backpropagated", default=20) - parser.add_argument("--context-sensitive-chunk-width", type=int, - help="number of frames used to estimate the state of the first frame in truncated BPTT ", default=20) - parser.add_argument("--num-targets", type=int, - help="number of network targets (e.g. num-pdf-ids/num-leaves)") - parser.add_argument("config_dir", - help="Directory to write config files and variables") - - print(' '.join(sys.argv)) - - args = parser.parse_args() - - if not os.path.exists(args.config_dir): - os.makedirs(args.config_dir) - - ## Check arguments. - if args.splice_indexes is None: - sys.exit("--splice-indexes argument is required") - if args.feat_dim is None or not (args.feat_dim > 0): - sys.exit("--feat-dim argument is required") - if args.num_targets is None or not (args.num_targets > 0): - sys.exit("--feat-dim argument is required") - if (args.num_lstm_layers < 1): - sys.exit("--num-lstm-layers has to be a positive integer") - if (args.bptt_truncation_width < 1): - sys.exit("--bptt-truncation-width has to be a positive integer") - if (args.context_sensitive_chunk_width < 0): - sys.exit("--context-sensitive-chunk-width has to be a non-negative integer") - - - - parsed_splice_output = ParseSpliceString(args.splice_indexes) - left_context = parsed_splice_output['left_context'] - right_context = parsed_splice_output['right_context'] - num_hidden_layers = parsed_splice_output['num_hidden_layers'] - splice_indexes = parsed_splice_output['splice_indexes'] +def ParseLstmDelayString(lstm_delay): + ## Work out lstm_delay e.g. "-1 [-1,1] -2" -> list([ [-1], [-1, 1], [-2] ]) + split1 = lstm_delay.split(" "); + lstm_delay_array = [] + try: + for i in range(len(split1)): + indexes = map(lambda x: int(x), split1[i].strip().lstrip('[').rstrip(']').strip().split(",")) + if len(indexes) < 1: + raise ValueError("invalid --lstm-delay argument, too-short element: " + + lstm_delay) + elif len(indexes) == 2 and indexes[0] * indexes[1] >= 0: + raise ValueError('Warning: ' + str(indexes) + ' is not a standard BLSTM mode. 
There should be a negative delay for the forward, and a postive delay for the backward.') + lstm_delay_array.append(indexes) + except ValueError as e: + raise ValueError("invalid --lstm-delay argument " + lstm_delay + str(e)) - if (num_hidden_layers < args.num_lstm_layers): - sys.exit("--num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") + return lstm_delay_array - left_context = left_context + args.bptt_truncation_width + args.context_sensitive_chunk_width - right_context = right_context - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(args.config_dir + "/vars", "w") - print('left_context=' + str(left_context), file=f) - print('right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, + splice_indexes, lstm_delay, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + num_lstm_layers, num_hidden_layers, + norm_based_clipping, clipping_threshold, + ng_per_element_scale_options, ng_affine_options, + label_delay, include_log_softmax, xent_regularize, self_repair_scale): config_lines = {'components':[], 'component-nodes':[]} config_files={} - prev_layer_output = AddInputLayer(config_lines, args.feat_dim, splice_indexes[0], args.ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') - AddOutputNode(init_config_lines, prev_layer_output) - config_files[args.config_dir + '/init.config'] = init_config_lines - - prev_layer_output = AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') - - for i in range(args.num_lstm_layers): - prev_layer_output = AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, args.cell_dim, - args.recurrent_projection_dim, args.non_recurrent_projection_dim) + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines + + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + for i in range(num_lstm_layers): + if len(lstm_delay[i]) == 2: # BLSTM layer case, add both forward and backward + prev_layer_output1 = nodes.AddLstmLayer(config_lines, "BLstm{0}_forward".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale = self_repair_scale) + prev_layer_output2 = nodes.AddLstmLayer(config_lines, "BLstm{0}_backward".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][1], self_repair_scale = self_repair_scale) + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output1['descriptor'], prev_layer_output2['descriptor']) + prev_layer_output['dimension'] = prev_layer_output1['dimension'] + 
prev_layer_output2['dimension'] + else: # LSTM layer case + prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale = self_repair_scale) # make the intermediate config file for layerwise discriminative # training - AddFinalLayer(config_lines, prev_layer_output, args.num_targets) - config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines - config_lines = {'components':[], 'component-nodes':[]} + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) - for i in range(args.num_lstm_layers, num_hidden_layers): - prev_layer_output = AddAffRelNormLayer(config_lines, "L{0}".format(i+1), prev_layer_output, args.hidden_dim) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + if len(lstm_delay[i]) == 2: + # since the form 'Append(Append(xx, yy), zz)' is not allowed, here we don't wrap the descriptor with 'Append()' so that we would have the form + # 'Append(xx, yy, zz)' in the next lstm layer + prev_layer_output['descriptor'] = '{0}, {1}'.format(prev_layer_output1['descriptor'], prev_layer_output2['descriptor']) + + if len(lstm_delay[i]) == 2: + # since there is no 'Append' in 'AffRelNormLayer', here we wrap the descriptor with 'Append()' + prev_layer_output['descriptor'] = 'Append({0})'.format(prev_layer_output['descriptor']) + for i in range(num_lstm_layers, num_hidden_layers): + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1), + prev_layer_output, hidden_dim, + ng_affine_options, self_repair_scale = self_repair_scale) # make the intermediate config file for layerwise discriminative # training - AddFinalLayer(config_lines, prev_layer_output, args.num_targets) - config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines config_lines = {'components':[], 'component-nodes':[]} # printing out the configs # init.config used to train lda-mllt train for key in config_files.keys(): PrintConfig(key, config_files[key]) + + + + +def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers): + parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay) + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + + if (num_hidden_layers < num_lstm_layers): + raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = 
open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + # print('initial_right_context=' + str(splice_array[0][-1]), file=f) + f.close() + + return [left_context, right_context, num_hidden_layers, splice_indexes] + + +def Main(): + args = GetArgs() + [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + + MakeConfigs(args.config_dir, + args.feat_dim, args.ivector_dim, args.num_targets, + splice_indexes, args.lstm_delay, args.cell_dim, + args.recurrent_projection_dim, args.non_recurrent_projection_dim, + args.num_lstm_layers, num_hidden_layers, + args.norm_based_clipping, + args.clipping_threshold, + args.ng_per_element_scale_options, args.ng_affine_options, + args.label_delay, args.include_log_softmax, args.xent_regularize, + args.self_repair_scale) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/lstm/train.sh b/egs/wsj/s5/steps/nnet3/lstm/train.sh index fade7ef454d..3a1c7f14535 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/train.sh +++ b/egs/wsj/s5/steps/nnet3/lstm/train.sh @@ -1,33 +1,29 @@ #!/bin/bash - -# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar # 2014-2015 Vijayaditya Peddinti # Apache 2.0. +# Terminology: +# sample - one input-output tuple, which is an input sequence and output sequence for LSTM +# frame - one output label and the input context used to compute it # Begin configuration section. cmd=run.pl -num_epochs=15 # Number of epochs of training; +num_epochs=10 # Number of epochs of training; # the number of iterations is worked out from this. -initial_effective_lrate=0.01 -final_effective_lrate=0.001 -rand_prune=4.0 # Relates to a speedup we do for LDA. -minibatch_size=512 # This default is suitable for GPU-based training. - # Set it to 128 for multi-threaded CPU-based training. - -samples_per_iter=400000 # each iteration of training, see this many samples - # per job. This option is passed to get_egs.sh -num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training -prior_subset_size=20000 # 20k samples per job, for computing priors. +prior_subset_size=20000 # 20k samples per job, for computing priors. num_jobs_compute_prior=10 # these are single-threaded, run on CPU. get_egs_stage=0 # can be used for rerunning after partial online_ivector_dir= -presoftmax_prior_scale_power=-0.25 +presoftmax_prior_scale_power=-0.25 # we haven't yet used pre-softmax prior scaling in the LSTM model remove_egs=true # set to false to disable removing egs after training is done. max_models_combine=20 # The "max_models_combine" is the maximum number of models we give @@ -48,45 +44,75 @@ stage=-6 exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage # count space-separated fields in splice_indexes to get num-hidden-layers. 
-splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 0 0 0 0" +splice_indexes="-2,-1,0,1,2 0 0" # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count + # LSTM parameters num_lstm_layers=3 cell_dim=1024 # dimension of the LSTM cell hidden_dim=1024 # the dimension of the fully connected hidden layer outputs -recurrent_projection_dim=256 +recurrent_projection_dim=256 non_recurrent_projection_dim=256 -bptt_truncation_width=20 # number of BPTT steps -context_sensitive_chunk_width=20 # number of steps used in the estimation of the first LSTM state - # see Chen 2015, "Training Deep Bidirectional LSTM Acoustic Model for LVCSR by a Context-Sensitive-Chunk BPTT Approach" - - -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't -randprune=4.0 # speeds up LDA. -affine_opts= - +norm_based_clipping=true # if true norm_based_clipping is used. + # In norm-based clipping the activation Jacobian matrix + # for the recurrent connections in the network is clipped + # to ensure that the individual row-norm (l2) does not increase + # beyond the clipping_threshold. + # If false, element-wise clipping is used. +clipping_threshold=30 # if norm_based_clipping is true this would be the maximum value of the row l2-norm, + # else this is the max-absolute value of each element in Jacobian. +chunk_width=20 # number of output labels in the sequence used to train an LSTM + # Caution: if you double this you should halve --samples-per-iter. +chunk_left_context=40 # number of steps used in the estimation of LSTM state before prediction of the first label +chunk_right_context=0 # number of steps used in the estimation of LSTM state before prediction of the first label (usually used in bi-directional LSTM case) +label_delay=5 # the lstm output is used to predict the label with the specified delay +lstm_delay=" -1 -2 -3 " # the delay to be used in the recurrence of lstms + # "-1 -2 -3" means the a three layer stacked LSTM would use recurrence connections with + # delays -1, -2 and -3 at layer1 lstm, layer2 lstm and layer3 lstm respectively + # "[-1,1] [-2,2] [-3,3]" means a three layer stacked bi-directional LSTM would use recurrence + # connections with delay -1 for the forward, 1 for the backward at layer1, + # -2 for the forward, 2 for the backward at layer2, and so on at layer3 +num_bptt_steps= # this variable counts the number of time steps to back-propagate from the last label in the chunk + # it is usually same as chunk_width + + +# nnet3-train options +shrink=0.99 # this parameter would be used to scale the parameter matrices +shrink_threshold=0.15 # a value less than 0.25 that we compare the mean of + # 'deriv-avg' for sigmoid components with, and if it's + # less, we shrink. +max_param_change=2.0 # max param change per minibatch +num_chunk_per_minibatch=100 # number of sequences to be processed in parallel every mini-batch + +samples_per_iter=20000 # this is really the number of egs in each archive. Each eg has + # 'chunk_width' frames in it-- for chunk_width=20, this value (20k) + # is equivalent to the 400k number that we use as a default in + # regular DNN training. +momentum=0.5 # e.g. 0.5. Note: we implemented it in such a way that + # it doesn't increase the effective learning rate. use_gpu=true # if true, we run on GPU. -num_threads=16 # if using CPU, the number of threads we use. cleanup=true egs_dir= max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. 
lda_opts= egs_opts= transform_dir= # If supplied, this dir used instead of alidir to find transforms. -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type=raw # or set to 'lda' to use LDA features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. num_jobs_align=30 # Number of jobs for realignment + +rand_prune=4.0 # speeds up LDA. + # End configuration section. -frames_per_eg=8 # to be passed on to get_egs.sh trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM @@ -102,14 +128,13 @@ if [ $# != 4 ]; then echo "Main options (for others, see top of script file)" echo " --config # config file containing options" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --num-epochs <#epochs|15> # Number of epochs of training" - echo " --initial-effective-lrate # effective learning rate at start of training." - echo " --final-effective-lrate # effective learning rate at end of training." + echo " --num-epochs <#epochs|10> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." echo " # data, 0.00025 for large data" - echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" - echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" - echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" - echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" + echo " --momentum # Momentum constant: note, this is " + echo " # implemented in such a way that it doesn't" + echo " # increase the effective learning rate." echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" @@ -118,24 +143,13 @@ if [ $# != 4 ]; then echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." - echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" - echo " # should not get too large, e.g. >2k)." - echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" - echo " # process." - echo " --splice-indexes " + echo " --splice-indexes " echo " # Frame indices used for each splice layer." echo " # Format : .... 
" + echo " # the number of fields determines the number of LSTM and non-recurrent layers" + echo " # also see the --num-lstm-layers option" echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " ################### LSTM options ###################### " - echo " --num-lstm-layers # number of LSTM layers" - echo " --lstm-cell-dim # dimension of the LSTM cell" - echo " --hidden-dim # the dimension of the fully connected hidden layer outputs" - echo " --recurrent-projection-dim # the output dimension of the recurrent-projection-matrix" - echo " --non-recurrent-projection-dim # the output dimension of the non-recurrent-projection-matrix" - echo " --bptt-truncation-width # number of BPTT steps" - echo " --context-sensitive-chunk-width # number of steps used in the estimation of the first LSTM state" echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" echo " # realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" @@ -144,7 +158,39 @@ if [ $# != 4 ]; then echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + echo " ################### LSTM options ###################### " + echo " --num-lstm-layers # number of LSTM layers" + echo " --cell-dim # dimension of the LSTM cell" + echo " --hidden-dim # the dimension of the fully connected hidden layer outputs" + echo " --recurrent-projection-dim # the output dimension of the recurrent-projection-matrix" + echo " --non-recurrent-projection-dim # the output dimension of the non-recurrent-projection-matrix" + echo " --chunk-left-context # number of time-steps used in the estimation of the first LSTM state" + echo " --chunk-width # number of output labels in the sequence used to train an LSTM" + echo " # Caution: if you double this you should halve --samples-per-iter." + echo " --norm-based-clipping # if true norm_based_clipping is used." + echo " # In norm-based clipping the activation Jacobian matrix" + echo " # for the recurrent connections in the network is clipped" + echo " # to ensure that the individual row-norm (l2) does not increase" + echo " # beyond the clipping_threshold." + echo " # If false, element-wise clipping is used." + echo " --num-bptt-steps # this variable counts the number of time steps to back-propagate from the last label in the chunk" + echo " # it defaults to chunk_width" + echo " --label-delay # the lstm output is used to predict the label with the specified delay" + + echo " --lstm-delay # the delay to be used in the recurrence of lstms" + echo " # \"-1 -2 -3\" means the a three layer stacked LSTM would use recurrence connections with " + echo " # delays -1, -2 and -3 at layer1 lstm, layer2 lstm and layer3 lstm respectively" + echo " --clipping-threshold # if norm_based_clipping is true this would be the maximum value of the row l2-norm," + echo " # else this is the max-absolute value of each element in Jacobian." + + echo " ################### LSTM specific training options ###################### " + echo " --num-chunks-per-minibatch # Number of sequences to be processed in parallel in a minibatch" + echo " --samples-per-iter <#samples|20000> # Number of egs in each archive of data. 
This times --chunk-width is" + echo " # the number of frames processed per iteration" + echo " --shrink # if non-zero this parameter will be used to scale the parameter matrices" + echo " --shrink-threshold # a threshold (should be between 0.0 and 0.25) that controls when to" + echo " # do parameter shrinking." + echo " for more options see the script" exit 1; fi @@ -202,8 +248,14 @@ if [ $stage -le -5 ]; then echo "$0: creating neural net configs"; # create the config files for nnet initialization - python steps/nnet3/lstm/make_configs.py \ - --splice-indexes "$splice_indexes" \ + # note an additional space is added to splice_indexes to + # avoid issues with the python ArgParser which can have + # issues with negative arguments (due to minus sign) + config_extra_opts=() + [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --splice-indexes "$splice_indexes " \ --num-lstm-layers $num_lstm_layers \ --feat-dim $feat_dim \ --ivector-dim $ivector_dim \ @@ -211,9 +263,10 @@ if [ $stage -le -5 ]; then --hidden-dim $hidden_dim \ --recurrent-projection-dim $recurrent_projection_dim \ --non-recurrent-projection-dim $non_recurrent_projection_dim \ - --bptt-truncation-width $bptt_truncation_width \ - --context-sensitive-chunk-width $context_sensitive_chunk_width \ + --norm-based-clipping $norm_based_clipping \ + --clipping-threshold $clipping_threshold \ --num-targets $num_leaves \ + --label-delay $label_delay \ $dir/configs || exit 1; # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; @@ -222,13 +275,13 @@ if [ $stage -le -5 ]; then $cmd $dir/log/nnet_init.log \ nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; fi - # sourcing the "vars" below sets -# left_context=(something) -# right_context=(something) +# model_left_context=(something) +# model_right_context=(something) # num_hidden_layers=(something) . $dir/configs/vars || exit 1; - +left_context=$((chunk_left_context + model_left_context)) +right_context=$((chunk_right_context + model_right_context)) context_opts="--left-context=$left_context --right-context=$right_context" ! [ "$num_hidden_layers" -gt 0 ] && echo \ @@ -236,7 +289,6 @@ context_opts="--left-context=$left_context --right-context=$right_context" [ -z "$transform_dir" ] && transform_dir=$alidir - if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then extra_opts=() [ ! 
-z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") @@ -245,26 +297,30 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then extra_opts+=(--transform-dir $transform_dir) extra_opts+=(--left-context $left_context) extra_opts+=(--right-context $right_context) + extra_opts+=(--valid-left-context $((chunk_width + left_context))) + extra_opts+=(--valid-right-context $((chunk_width + right_context))) + + # Note: in RNNs we process sequences of labels rather than single label per sample echo "$0: calling get_egs.sh" steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ - --samples-per-iter $samples_per_iter --stage $get_egs_stage \ - --io-opts "$io_opts" \ --cmd "$cmd" $egs_opts \ - --frames-per-eg $frames_per_eg \ + --stage $get_egs_stage \ + --samples-per-iter $samples_per_iter \ + --frames-per-eg $chunk_width \ $data $alidir $dir/egs || exit 1; fi -if [ "$feat_dim" != "$(cat $dir/egs/info/feat_dim)" ]; then - echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $dir/egs/info/feat_dim)"; +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; exit 1; fi -if [ "$ivector_dim" != "$(cat $dir/egs/info/ivector_dim)" ]; then - echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $dir/egs/info/ivector_dim)"; +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; exit 1; fi -[ -z $egs_dir ] && egs_dir=$dir/egs - # copy any of the following that exist, to $dir. cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null @@ -272,22 +328,18 @@ cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null # the --egs-dir option was used on the command line). egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 -( ! [ $(cat $egs_dir/info/left_context) -le $left_context ] || - ! [ $(cat $egs_dir/info/right_context) -le $right_context ] ) && \ + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ echo "$0: egs in $egs_dir have too little context" && exit -1; -frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } -num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } - -# num_archives_expanded considers each separate label-position from -# 0..frames_per_eg-1 to be a separate archive. -num_archives_expanded=$[$num_archives*$frames_per_eg] +chunk_width=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; } [ $num_jobs_initial -gt $num_jobs_final ] && \ echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; -[ $num_jobs_final -gt $num_archives_expanded ] && \ - echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; +[ $num_jobs_final -gt $num_archives ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives." && exit 1; if [ $stage -le -3 ]; then @@ -320,14 +372,14 @@ if [ $stage -le -2 ]; then echo "$0: preparing initial vector for FixedScaleComponent before softmax" echo " ... 
using priors^$presoftmax_prior_scale_power and rescaling to average 1" - # obtains raw pdf count + # obtains raw pdf count $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; $cmd $dir/log/sum_pdf_counts.log \ vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; rm $dir/pdf_counts.* - + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } num_pdfs=NF-2; average_count = total/num_pdfs; @@ -351,10 +403,10 @@ fi # set num_iters so that as close as possible, we process the data $num_epochs -# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. -num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_to_process=$[$num_epochs*$num_archives] num_archives_processed=0 num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] @@ -379,22 +431,16 @@ if $use_gpu; then exit 1 fi else - if [ $num_threads -gt 1 ]; then - parallel_suffix="-parallel" - parallel_train_opts="--num-threads=$num_threads" - train_queue_opt="--num-threads $num_threads" - combine_queue_opt="" # the combine stage will be quite slow if not using - # GPU, as we didn't enable that program to use - # multiple threads. - else - parallel_suffix="" - fi + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. prior_gpu_opt="--use-gpu=no" prior_queue_opt="" fi - -approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +approx_iters_per_epoch_final=$[$num_archives/$num_jobs_final] # First work out how many iterations we want to combine over in the final # nnet3-combine-fast invocation. (We may end up subsampling from these if the # number exceeds max_model_combine). The number we use is: @@ -423,16 +469,16 @@ for realign_time in $realign_times; do done cur_egs_dir=$egs_dir - +[ -z $num_bptt_steps ] && num_bptt_steps=$chunk_width; +min_deriv_time=$((chunk_width - num_bptt_steps)) while [ $x -lt $num_iters ]; do [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; - this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - - echo "On iteration $x, learning rate is $this_learning_rate." + this_effective_learning_rate=$(perl -e "print ($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt));"); + this_learning_rate=$(perl -e "print ($this_effective_learning_rate*$this_num_jobs);"); if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -440,6 +486,15 @@ while [ $x -lt $num_iters ]; do fi if [ $x -ge 0 ] && [ $stage -le $x ]; then + # Set this_shrink value. 
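To make the schedule arithmetic above concrete: num_iters is chosen so that num_iters times the average number of jobs roughly equals num_epochs * num_archives, and the per-iteration rate interpolates geometrically between the initial and final effective rates. A minimal Python sketch of the same computation (illustrative only, mirroring the perl one-liners above; not part of the patch):

import math

def lr_for_iter(x, num_iters, num_jobs_initial, num_jobs_final,
                num_archives_processed, num_archives_to_process,
                initial_effective_lrate, final_effective_lrate):
    # The number of parallel jobs grows linearly from the initial to the final value.
    this_num_jobs = int(0.5 + num_jobs_initial
                        + (num_jobs_final - num_jobs_initial) * x / float(num_iters))
    # The effective learning rate decays geometrically with the fraction of
    # archives processed so far, reaching final_effective_lrate on the last iteration.
    if x + 1 >= num_iters:
        eff = final_effective_lrate
    else:
        eff = initial_effective_lrate * math.exp(
            num_archives_processed
            * math.log(final_effective_lrate / initial_effective_lrate)
            / num_archives_to_process)
    # Each job is given the effective rate scaled by the number of jobs,
    # since the jobs' models are averaged afterwards.
    return this_num_jobs, eff * this_num_jobs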
+ if [ $x -eq 0 ] || nnet3-am-info --print-args=false $dir/$x.mdl | \ + perl -e "while(<>){ if (m/type=Sigmoid.+deriv-avg=.+mean=(\S+)/) { \$n++; \$tot+=\$1; } } exit(\$tot/\$n > $shrink_threshold);"; then + this_shrink=$shrink; # e.g. avg-deriv of sigmoids was <= 0.125, so shrink. + else + this_shrink=1.0 # don't shrink: sigmoids are not over-saturated. + fi + echo "On iteration $x, learning rate is $this_learning_rate and shrink value is $this_shrink." + if [ ! -z "${realign_this_iter[$x]}" ]; then time=${realign_this_iter[$x]} @@ -478,23 +533,22 @@ while [ $x -lt $num_iters ]; do steps/nnet3/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ - "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & $cmd $dir/log/compute_prob_train.$x.log \ nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ - "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & - # nnet3-show-progress not implemented yet - #if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then - # $cmd $dir/log/progress.$x.log \ - # nnet3-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \ - # ark:$cur_egs_dir/train_diagnostic.egs '&&' \ - # nnet3-info $dir/$x.mdl & - #fi + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" '&&' \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/train_diagnostic.egs ark:-|" & + fi echo "Training neural net (pass $x)" @@ -506,20 +560,23 @@ while [ $x -lt $num_iters ]; do cur_num_hidden_layers=$[1+$x/$add_layers_period] config=$dir/configs/layer$cur_num_hidden_layers.config raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + cache_read_opt="" # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. else do_average=true if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + cache_read_opt="--read-cache=$dir/cache.$x" fi if $do_average; then - this_minibatch_size=$minibatch_size + this_num_chunk_per_minibatch=$num_chunk_per_minibatch else # on iteration zero or when we just added a layer, use a smaller minibatch # size (and we will later choose the output of just one of the jobs): the # model-averaging isn't always helpful when the model is changing too fast # (i.e. it can worsen the objective function), and the smaller minibatch # size will help to keep the update stable. - this_minibatch_size=$[$minibatch_size/2]; + this_num_chunk_per_minibatch=$[$num_chunk_per_minibatch/2]; fi rm $dir/.error 2>/dev/null @@ -528,22 +585,29 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. 
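The shrink heuristic above is dense; a rough Python equivalent may be easier to follow (a sketch only, assuming, as the regex implies, that nnet3-am-info prints one line per Sigmoid component containing a deriv-avg block with a mean= field):

import re
import subprocess

def pick_shrink_value(model, iteration, shrink, shrink_threshold):
    # On iteration 0, or when the sigmoids look saturated (small average
    # derivative), return the shrink factor; otherwise do not shrink.
    if iteration == 0:
        return shrink
    info = subprocess.check_output(
        ['nnet3-am-info', '--print-args=false', model]).decode()
    means = []
    for line in info.splitlines():
        m = re.search(r'type=Sigmoid.+deriv-avg=.+mean=(\S+)', line)
        if m:
            means.append(float(m.group(1).rstrip(',]')))  # tolerate trailing punctuation
    if not means:
        return 1.0
    return shrink if sum(means) / len(means) <= shrink_threshold else 1.0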
- - # We can't easily use a single parallel SGE job to do the main training, + + # We cannot easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + for n in $(seq $this_num_jobs); do - k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive # the other indexes from. archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. - frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame - # index; this increases more slowly than the archive index because the - # same archive with different frame indexes will give similar gradients, - # so we want to separate them in time. - + if [ $n -eq 1 ]; then + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. + cache_write_opt=" --write-cache=$dir/cache.$[$x+1]" + else + cache_write_opt="" + fi $cmd $train_queue_opt $dir/log/train.$x.$n.log \ - nnet3-train$parallel_suffix $parallel_train_opts "$raw" \ - "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ + nnet3-train $parallel_train_opts $cache_read_opt $cache_write_opt --print-interval=10 --momentum=$momentum \ + --max-param-change=$max_param_change \ + --optimization.min-deriv-time=$min_deriv_time "$raw" \ + "ark,bg:nnet3-copy-egs $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_num_chunk_per_minibatch --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait @@ -552,8 +616,9 @@ while [ $x -lt $num_iters ]; do # have printed a more specific one. [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + models_to_average=$(steps/nnet3/get_successful_models.py $this_num_jobs $dir/log/train.$x.%.log) nnets_list= - for n in `seq 1 $this_num_jobs`; do + for n in $models_to_average; do nnets_list="$nnets_list $dir/$[$x+1].$n.raw" done @@ -561,19 +626,24 @@ while [ $x -lt $num_iters ]; do # average the output of the different jobs. $cmd $dir/log/average.$x.log \ nnet3-average $nnets_list - \| \ - nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + nnet3-am-copy --scale=$this_shrink --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; else # choose the best from the different jobs. 
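As an aside on the archive-selection arithmetic a few lines above (the old per-frame indexing is gone because each RNN example is already a whole chunk), a tiny sketch with made-up numbers:

def archive_for_job(num_archives_processed, n, num_archives):
    k = num_archives_processed + n - 1   # zero-based index over all jobs run so far
    return (k % num_archives) + 1        # 1-based archive index

# e.g. with 120 archives and 350 archives already processed, job n=3 reads egs.113.ark
assert archive_for_job(350, 3, 120) == 113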
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; - $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; $cmd $dir/log/select.$x.log \ - nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + nnet3-am-copy --scale=$this_shrink --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; fi + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + rm $nnets_list [ ! -f $dir/$[$x+1].mdl ] && exit 1; if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ @@ -581,6 +651,7 @@ while [ $x -lt $num_iters ]; do rm $dir/$[$x-1].mdl fi fi + rm $dir/cache.$x 2>/dev/null x=$[$x+1] num_archives_processed=$[$num_archives_processed+$this_num_jobs] done @@ -601,14 +672,11 @@ if [ $stage -le $num_iters ]; then nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; done - # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, - # as if there are many models it can give out-of-memory error; and we set - # num-threads to 8 to speed it up (this isn't ideal...) - + combine_num_chunk_per_minibatch=$(python -c "print int(1024.0/($chunk_width))") $cmd $combine_queue_opt $dir/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size=$combine_num_chunk_per_minibatch ark:$cur_egs_dir/combine.egs ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; # Compute the probability of the final, combined model with @@ -616,20 +684,21 @@ if [ $stage -le $num_iters ]; then # different subsets will lead to different probs. $cmd $dir/log/compute_prob_valid.final.log \ nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ - "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & $cmd $dir/log/compute_prob_train.final.log \ nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ - "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & fi if [ $stage -le $[$num_iters+1] ]; then echo "Getting average posterior for purposes of adjusting the priors." # Note: this just uses CPUs, using a smallish subset of data. 
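A note on the combination stage above: the minibatch size is now expressed in chunks rather than frames, presumably so that each minibatch still covers roughly the 1024 output frames that the previous fixed --minibatch-size=1024 provided. For example:

chunk_width = 50                                             # illustrative value
combine_num_chunk_per_minibatch = int(1024.0 / chunk_width)  # == 20 chunks, ~1000 output frames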
rm $dir/post.$x.*.vec 2>/dev/null + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ - nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \ - nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ - nnet3-merge-egs ark:- ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; @@ -667,7 +736,7 @@ if $cleanup; then for x in `seq 0 $num_iters`; do if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then # delete all but every 100th model; don't delete the ones which combine to form the final model. - rm $dir/$x.mdl + rm $dir/$x.mdl fi done fi diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh new file mode 100755 index 00000000000..fadc164c539 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh @@ -0,0 +1,251 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2014-2015 Vimal Manohar +# Apache 2.0. + +# Create denominator lattices for MMI/MPE training. +# This version uses the neural-net models (version 3, i.e. the nnet3 code). +# Creates its output in $dir/lat.*.gz + +# Begin configuration section. +nj=4 +cmd=run.pl +sub_split=1 +beam=13.0 +frames_per_chunk=50 +lattice_beam=7.0 +self_loop_scale=0.1 +acwt=0.1 +max_active=5000 +min_active=200 +transform_dir= +max_mem=20000000 # This will stop the processes getting too large. +# This is in bytes, but not "real" bytes-- you have to multiply +# by something like 5 or 10 to get real bytes (not sure why so large) +num_threads=1 # Fixed to 1 for now +online_ivector_dir= +determinize=true +minimize=false +ivector_scale=1.0 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +feat_type= # you can set this in order to run on top of delta features, although we don't + # normally want to do this. +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +num_threads=1 # Fixed to 1 for now + +if [ $# != 4 ]; then + echo "Usage: steps/nnet3/make_denlats.sh [options] " + echo " e.g.: steps/nnet3/make_denlats.sh data/train data/lang exp/nnet4 exp/nnet4_denlats" + echo "Works for (delta|lda) features, and (with --transform-dir option) such features" + echo " plus transforms." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + echo " --num-threads # number of threads per decoding job" + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + + +extra_files= +[ ! 
-z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $extra_files; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +cp -rH $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. +new_lang="$dir/"$(basename "$lang") + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph. + +echo "Compiling decoding graph in $dir/dengraph" +if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + echo "Making unigram grammar FST in $new_lang" + cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \ + || exit 1; + utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1; +fi +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. 
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +fi + + +# if this job is interrupted by the user, we want any background jobs to be +# killed too. +cleanup() { + local pids=$(jobs -pr) + [ -n "$pids" ] && kill $pids +} +trap "cleanup" INT QUIT TERM EXIT + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" + cp $srcdir/frame_subsampling_factor $dir +fi + +lattice_determinize_cmd= +if $determinize; then + lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune --beam=$beam ark:- ark:- |" +fi + +if [ $sub_split -eq 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=false --determinize-lattice=false \ + --word-determinize=false --phone-determinize=false \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \ + --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" \ + "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.JOB.gz" || exit 1 +else + + # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim + # to have at most two jobs running at each time. The idea is that if we have stragglers + # from one job, we can be processing another one at the same time. + rm $dir/.error 2>/dev/null + + prev_pid= + for n in `seq $[nj+1]`; do + if [ $n -gt $nj ]; then + this_pid= + elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + this_pid= + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! 
-d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + + $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=false --determinize-lattice=false \ + --word-determinize=false --phone-determinize=false \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \ + --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" \ + "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error & + this_pid=$! + fi + if [ ! -z "$prev_pid" ]; then # Wait for the previous job; merge the previous set of lattices. + wait $prev_pid + [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1; + rm $dir/.merge_error 2>/dev/null + echo Merging archives for data subset $prev_n + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error; + done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error; + [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1; + rm $dir/lat.$prev_n.*.gz + touch $dir/.done.$prev_n + fi + prev_n=$n + prev_pid=$this_pid + done +fi + + +echo "$0: done generating denominator lattices." + diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py new file mode 100755 index 00000000000..af6afcb99e3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -0,0 +1,538 @@ +#!/usr/bin/env python + +# tdnn or RNN with 'jesus layer' + +# inputs to jesus layer: +# - for each spliced version of the previous layer the output (of dim --jesus-forward-output-dim) + +# outputs of jesus layer: +# for all layers: +# --jesus-forward-output-dim + + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import re, os, argparse, sys, math, warnings +import imp + +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/train_tdnn.sh for example."); +parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice[:recurrence] indexes at each hidden layer, e.g. '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3'. " + "Note: recurrence indexes are optional, may not appear in 1st layer, and must be " + "either all negative or all positive for any given layer.") + +# Only one of these arguments can be specified, and one of them has to +# be compulsarily specified +feat_group = parser.add_mutually_exclusive_group(required = True) +feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 
13") +feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + +# only one of these arguments can be specified +ivector_group = parser.add_mutually_exclusive_group(required = False) +ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) +ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + +num_target_group = parser.add_mutually_exclusive_group(required = True) +num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") +num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") +num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + +parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) +parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) +parser.add_argument("--xent-separate-forward-affine", type=str, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default="false", choices = ["false", "true"]) +parser.add_argument("--use-repeated-affine", type=str, + help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)", + default="true", choices = ["false", "true"]) +parser.add_argument("--final-layer-learning-rate-factor", type=float, + help="Learning-rate factor for final affine component", + default=1.0) +parser.add_argument("--self-repair-scale", type=float, + help="Small scale involved in fixing derivatives, if supplied (e.g. try 0.00001)", + default=0.0) +parser.add_argument("--jesus-hidden-dim", type=int, + help="hidden dimension of Jesus layer.", default=10000) +parser.add_argument("--jesus-forward-output-dim", type=int, + help="part of output dimension of Jesus layer that goes to next layer", + default=1000) +parser.add_argument("--jesus-forward-input-dim", type=int, + help="Input dimension of Jesus layer that comes from affine projection " + "from the previous layer (same as output dim of forward affine transform)", + default=1000) +parser.add_argument("--final-hidden-dim", type=int, + help="Final hidden layer dimension-- or if <0, the same as " + "--jesus-forward-input-dim", default=-1) +parser.add_argument("--num-jesus-blocks", type=int, + help="number of blocks in Jesus layer. All configs of the form " + "--jesus-*-dim will be rounded up to be a multiple of this.", + default=100); +parser.add_argument("--jesus-stddev-scale", type=float, + help="Scaling factor on parameter stddev of Jesus layer (smaller->jesus layer learns faster)", + default=1.0) +parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components (only relevant if " + "recurrence indexes are specified). If clipping-threshold=0 no clipping is done", + default=15) +parser.add_argument("config_dir", + help="Directory to write config files and variables"); + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + +## Check arguments. 
+if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + +if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) +elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + +if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + +if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + +if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + +if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + +## Check arguments. +if args.num_jesus_blocks < 1: + sys.exit("invalid --num-jesus-blocks value"); +if args.final_hidden_dim < 0: + args.final_hidden_dim = args.jesus_forward_input_dim + +for name in [ "jesus_hidden_dim", "jesus_forward_output_dim", "jesus_forward_input_dim", + "final_hidden_dim" ]: + old_val = getattr(args, name) + if old_val % args.num_jesus_blocks != 0: + new_val = old_val + args.num_jesus_blocks - (old_val % args.num_jesus_blocks) + printable_name = '--' + name.replace('_', '-') + print('Rounding up {0} from {1} to {2} to be a multiple of --num-jesus-blocks={3} '.format( + printable_name, old_val, new_val, args.num_jesus_blocks)) + setattr(args, name, new_val); + +# this is a bit like a struct, initialized from a string, which describes how to +# set up the statistics-pooling and statistics-extraction components. +# An example string is 'mean(-99:3:9::99)', which means, compute the mean of +# data within a window of -99 to +99, with distinct means computed every 9 frames +# (we round to get the appropriate one), and with the input extracted on multiples +# of 3 frames (so this will force the input to this layer to be evaluated +# every 3 frames). Another example string is 'mean+stddev(-99:3:9:99)', +# which will also cause the standard deviation to be computed. +class StatisticsConfig: + # e.g. c = StatisticsConfig('mean+stddev(-99:3:9:99)', 400, 'jesus1-forward-output-affine') + def __init__(self, config_string, input_dim, input_name): + self.input_dim = input_dim + self.input_name = input_name + + m = re.search("(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", + config_string) + if m == None: + sys.exit("Invalid splice-index or statistics-config string: " + config_string) + self.output_stddev = (m.group(1) != 'mean') + self.left_context = -int(m.group(2)) + self.input_period = int(m.group(3)) + self.stats_period = int(m.group(4)) + self.right_context = int(m.group(5)) + if not (self.left_context > 0 and self.right_context > 0 and + self.input_period > 0 and self.stats_period > 0 and + self.left_context % self.stats_period == 0 and + self.right_context % self.stats_period == 0 and + self.stats_period % self.input_period == 0): + sys.exit("Invalid configuration of statistics-extraction: " + config_string) + + # OutputDim() returns the output dimension of the node that this produces. + def OutputDim(self): + return self.input_dim * (2 if self.output_stddev else 1) + + # OutputDims() returns an array of output dimensions, consisting of + # [ input-dim ] if just "mean" was specified, otherwise + # [ input-dim input-dim ] + def OutputDims(self): + return [ self.input_dim, self.input_dim ] if self.output_stddev else [ self.input_dim ] + + # Descriptor() returns the textual form of the descriptor by which the + # output of this node is to be accessed. 
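A usage note on the StatisticsConfig class being defined here (assuming the class exactly as written above): the string 'mean+stddev(-99:3:9:99)' applied to a 400-dimensional input parses to left_context=99, right_context=99, input_period=3, stats_period=9 with output_stddev=True, so OutputDim() is 800 (mean and stddev stacked):

c = StatisticsConfig('mean+stddev(-99:3:9:99)', 400, 'jesus1-forward-output-affine')
assert c.OutputDim() == 800
assert c.left_context == 99 and c.right_context == 99
assert c.input_period == 3 and c.stats_period == 9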
+ def Descriptor(self): + return 'Round({0}-pooling-{1}-{2}, {3})'.format(self.input_name, self.left_context, self.right_context, + self.stats_period) + + # This function writes the configuration lines need to compute the specified + # statistics, to the file f. + def WriteConfigs(self, f): + print('component name={0}-extraction-{1}-{2} type=StatisticsExtractionComponent input-dim={3} ' + 'input-period={4} output-period={5} include-variance={6} '.format( + self.input_name, self.left_context, self.right_context, + self.input_dim, self.input_period, self.stats_period, + ('true' if self.output_stddev else 'false')), file=f) + print('component-node name={0}-extraction-{1}-{2} component={0}-extraction-{1}-{2} input={0} '.format( + self.input_name, self.left_context, self.right_context), file=f) + stats_dim = 1 + self.input_dim * (2 if self.output_stddev else 1) + print('component name={0}-pooling-{1}-{2} type=StatisticsPoolingComponent input-dim={3} ' + 'input-period={4} left-context={1} right-context={2} num-log-count-features=0 ' + 'output-stddevs={5} '.format(self.input_name, self.left_context, self.right_context, + stats_dim, self.stats_period, + ('true' if self.output_stddev else 'false')), + file=f) + print('component-node name={0}-pooling-{1}-{2} component={0}-pooling-{1}-{2} input={0}-extraction-{1}-{2} '.format( + self.input_name, self.left_context, self.right_context), file=f) + + + + +## Work out splice_array +## e.g. for +## args.splice_indexes == '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3' +## we would have +## splice_array = [ [ -3,-2,...3 ], [-3,0] [-3,0] [-6,-3,0] + + +splice_array = [] +left_context = 0 +right_context = 0 +split_on_spaces = args.splice_indexes.split(" "); # we already checked the string is nonempty. +if len(split_on_spaces) < 2: + sys.exit("invalid --splice-indexes argument, too short: " + + args.splice_indexes) +try: + for string in split_on_spaces: + this_layer = len(splice_array) + + this_splices = string.split(",") + splice_array.append(this_splices) + # the rest of this block updates left_context and right_context, and + # does some checking. 
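As the comment above says, the rest of this block accumulates left_context and right_context over the layers; a worked example (plain integer offsets only, no statistics specifiers) for --splice-indexes '-3,-2,-1,0,1,2,3 -3,0 -3,0 -6,-3,0':

# layer 1: offsets -3..3    -> left += 3, right += 3
# layer 2: offsets -3,0     -> left += 3, right += 0
# layer 3: offsets -3,0     -> left += 3, right += 0
# layer 4: offsets -6,-3,0  -> left += 6, right += 0
left_context, right_context, num_hidden_layers = 15, 3, 4   # what ends up in configs/vars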
+ leftmost_splice = 10000 + rightmost_splice = -10000 + for s in this_splices: + try: + n = int(s) + if n < leftmost_splice: + leftmost_splice = n + if n > rightmost_splice: + rightmost_splice = n + except: + if len(splice_array) == 1: + sys.exit("First dimension of splicing array must not have averaging [yet]") + try: + x = StatisticsConfig(s, 100, 'foo') + except: + sys.exit("The following element of the splicing array is not a valid specifier " + "of statistics: " + s) + + if leftmost_splice == 10000 or rightmost_splice == -10000: + sys.exit("invalid element of --splice-indexes: " + string) + left_context += -leftmost_splice + right_context += rightmost_splice +except ValueError as e: + sys.exit("invalid --splice-indexes argument " + args.splice_indexes + " " + str(e)) +left_context = max(0, left_context) +right_context = max(0, right_context) +num_hidden_layers = len(splice_array) +input_dim = len(splice_array[0]) * args.feat_dim + args.ivector_dim + +f = open(args.config_dir + "/vars", "w") +print('left_context=' + str(left_context), file=f) +print('right_context=' + str(right_context), file=f) +print('num_hidden_layers=' + str(num_hidden_layers), file=f) +f.close() + + +f = open(args.config_dir + "/init.config", "w") +print('# Config file for initializing neural network prior to', file=f) +print('# preconditioning matrix computation', file=f) +print('input-node name=input dim=' + str(args.feat_dim), file=f) +list=[ ('Offset(input, {0})'.format(n) if n != 0 else 'input' ) for n in splice_array[0] ] +if args.ivector_dim > 0: + print('input-node name=ivector dim=' + str(args.ivector_dim), file=f) + list.append('ReplaceIndex(ivector, t, 0)') +# example of next line: +# output-node name=output input="Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... , Offset(input, 3), ReplaceIndex(ivector, t, 0))" +print('output-node name=output input=Append({0})'.format(", ".join(list)), file=f) +f.close() + + +for l in range(1, num_hidden_layers + 1): + # the following summarizes the structure of the layers: Here, the Jesus component includes ReLU at its input and output, and renormalize + # at its output after the ReLU. + # layer1: splice + LDA-transform + affine + ReLU + renormalize + # layerX: splice + Jesus + affine + ReLU + + # Inside the jesus component is: + # [permute +] ReLU + repeated-affine + ReLU + repeated-affine + # [we make the repeated-affine the last one so we don't have to redo that in backprop]. + # We follow this with a post-jesus composite component containing the operations: + # [permute +] ReLU + renormalize + # call this post-jesusN. + # After this we use dim-range nodes to split up the output into + # [ jesusN-forward-output, jesusN-direct-output and jesusN-projected-output ] + # parts; + # and nodes for the jesusN-forward-affine. + + f = open(args.config_dir + "/layer{0}.config".format(l), "w") + print('# Config file for layer {0} of the network'.format(l), file=f) + if l == 1: + print('component name=lda type=FixedAffineComponent matrix={0}/lda.mat'. + format(args.config_dir), file=f) + splices = [ ('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_array[l-1] ] + if args.ivector_dim > 0: splices.append('ReplaceIndex(ivector, t, 0)') + orig_input='Append({0})'.format(', '.join(splices)) + # e.g. orig_input = 'Append(Offset(input, -2), ... 
Offset(input, 2), ivector)' + print('component-node name=lda component=lda input={0}'.format(orig_input), + file=f) + # after the initial LDA transform, put a trainable affine layer and a ReLU, followed + # by a NormalizeComponent. + print('component name=affine1 type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} bias-stddev=0'.format( + input_dim, args.jesus_forward_input_dim), file=f) + print('component-node name=affine1 component=affine1 input=lda', + file=f) + # the ReLU after the affine + print('component name=relu1 type=RectifiedLinearComponent dim={1} self-repair-scale={2}'.format( + l, args.jesus_forward_input_dim, args.self_repair_scale), file=f) + print('component-node name=relu1 component=relu1 input=affine1', file=f) + # the renormalize component after the ReLU + print ('component name=renorm1 type=NormalizeComponent dim={0} '.format( + args.jesus_forward_input_dim), file=f) + print('component-node name=renorm1 component=renorm1 input=relu1', file=f) + cur_output = 'renorm1' + cur_affine_output_dim = args.jesus_forward_input_dim + else: + splices = [] + spliced_dims = [] + for s in splice_array[l-1]: + # the connection from the previous layer + try: + offset = int(s) + # it's an integer offset. + splices.append('Offset({0}, {1})'.format(cur_output, offset)) + spliced_dims.append(cur_affine_output_dim) + except: + # it's not an integer offset, so assume it specifies the + # statistics-extraction. + stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) + stats.WriteConfigs(f) + splices.append(stats.Descriptor()) + spliced_dims.extend(stats.OutputDims()) + + # get the input to the Jesus layer. + cur_input = 'Append({0})'.format(', '.join(splices)) + cur_dim = sum(spliced_dims) + + this_jesus_output_dim = args.jesus_forward_output_dim + + # As input to the Jesus component we'll append the spliced input and any + # mean/stddev-stats input, and the first thing inside the component that + # we do is rearrange the dimensions so that things pertaining to a + # particular block stay together. + + column_map = [] + for x in range(0, args.num_jesus_blocks): + dim_offset = 0 + for src_splice in spliced_dims: + src_block_size = src_splice / args.num_jesus_blocks + for y in range(0, src_block_size): + column_map.append(dim_offset + (x * src_block_size) + y) + dim_offset += src_splice + if sorted(column_map) != range(0, sum(spliced_dims)): + print("column_map is " + str(column_map)) + print("num_jesus_blocks is " + str(args.num_jesus_blocks)) + print("spliced_dims is " + str(spliced_dims)) + sys.exit("code error creating new column order") + + need_input_permute_component = (column_map != range(0, sum(spliced_dims))) + + # Now add the jesus component. + + permute_offset = (1 if need_input_permute_component else 0) + + if args.jesus_hidden_dim > 0: # normal case where we have jesus-hidden-dim. + num_sub_components = 4 + permute_offset + hidden_else_output_dim = args.jesus_hidden_dim + else: # no hidden part in jesus layer. + num_sub_components = 2 + permute_offset + hidden_else_output_dim = args.jesus_forward_output_dim + print('component name=jesus{0} type=CompositeComponent num-components={1}'.format( + l, num_sub_components), file=f, end='') + # print the sub-components of the CompositeComopnent on the same line. + # this CompositeComponent has the same effect as a sequence of + # components, but saves memory. 
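To make the column_map construction above concrete with tiny (unrealistically small) numbers: with spliced_dims = [4, 4] and --num-jesus-blocks=2, each spliced input contributes 2 columns per jesus block, and the map gathers same-block columns from both inputs so each block stays contiguous:

spliced_dims, num_jesus_blocks = [4, 4], 2
column_map = []
for x in range(0, num_jesus_blocks):
    dim_offset = 0
    for src_splice in spliced_dims:
        src_block_size = src_splice // num_jesus_blocks   # integer division, as in the python 2 code above
        for y in range(0, src_block_size):
            column_map.append(dim_offset + (x * src_block_size) + y)
        dim_offset += src_splice
assert column_map == [0, 1, 4, 5, 2, 3, 6, 7]   # != range(8), so a PermuteComponent is needed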
+ if need_input_permute_component: + print(" component1='type=PermuteComponent column-map={1}'".format( + l, ','.join([str(x) for x in column_map])), file=f, end='') + print(" component{0}='type=RectifiedLinearComponent dim={1} self-repair-scale={2}'".format( + 1 + permute_offset, + cur_dim, args.self_repair_scale), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + 2 + permute_offset, + cur_dim, hidden_else_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + 2 + permute_offset, + cur_dim, hidden_else_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks)), + file=f, end='') + + if args.jesus_hidden_dim > 0: # normal case where we have jesus-hidden-dim. + print(" component{0}='type=RectifiedLinearComponent dim={1} self-repair-scale={2}'".format( + 3 + permute_offset, hidden_else_output_dim, + args.self_repair_scale), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + 4 + permute_offset, + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(args.jesus_hidden_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + 4 + permute_offset, + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt((args.jesus_hidden_dim / args.num_jesus_blocks))), + file=f, end='') + + print("", file=f) # print newline. + print('component-node name=jesus{0} component=jesus{0} input={1}'.format( + l, cur_input), file=f) + + # now print the post-Jesus component which consists of ReLU + + # renormalize. + + num_sub_components = 2 + print('component name=post-jesus{0} type=CompositeComponent num-components=2'.format(l), + file=f, end='') + + # still within the post-Jesus component, print the ReLU + print(" component1='type=RectifiedLinearComponent dim={0} self-repair-scale={1}'".format( + this_jesus_output_dim, args.self_repair_scale), file=f, end='') + # still within the post-Jesus component, print the NormalizeComponent + print(" component2='type=NormalizeComponent dim={0} '".format( + this_jesus_output_dim), file=f, end='') + print("", file=f) # print newline. + print('component-node name=post-jesus{0} component=post-jesus{0} input=jesus{0}'.format(l), + file=f) + + # handle the forward output, we need an affine node for this: + cur_affine_output_dim = (args.jesus_forward_input_dim if l < num_hidden_layers else args.final_hidden_dim) + print('component name=forward-affine{0} type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. 
+ format(l, args.jesus_forward_output_dim, cur_affine_output_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine component=forward-affine{0} input=post-jesus{0}'.format( + l), file=f) + # for each recurrence delay, create an affine node followed by a + # clip-gradient node. [if there are multiple recurrences in the same layer, + # each one gets its own affine projection.] + + # The reason we set the param-stddev to 0 is out of concern that if we + # initialize to nonzero, this will encourage the corresponding inputs at + # the jesus layer to become small (to remove this random input), which + # in turn will make this component learn slowly (due to small + # derivatives). we set the bias-mean to 0.001 so that the ReLUs on the + # input of the Jesus layer are in the part of the activation that has a + # nonzero derivative- otherwise with this setup it would never learn. + + cur_output = 'jesus{0}-forward-output-affine'.format(l) + + + # with each new layer we regenerate the final-affine component, with a ReLU before it + # because the layers we printed don't end with a nonlinearity. + print('component name=final-relu type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( + cur_affine_output_dim, args.self_repair_scale), file=f) + print('component-node name=final-relu component=final-relu input={0}'.format(cur_output), + file=f) + print('component name=final-affine type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} learning-rate-factor={2} param-stddev=0.0 bias-stddev=0'.format( + cur_affine_output_dim, args.num_targets, + args.final_layer_learning_rate_factor), file=f) + print('component-node name=final-affine component=final-affine input=final-relu', + file=f) + # printing out the next two, and their component-nodes, for l > 1 is not + # really necessary as they will already exist, but it doesn't hurt and makes + # the structure clearer. + if args.include_log_softmax == "true": + print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-affine', file=f) + print('output-node name=output input=final-log-softmax', file=f) + else: + print('output-node name=output input=final-affine', file=f) + + if args.xent_regularize != 0.0: + xent_input = 'final-relu' + if l == num_hidden_layers and args.xent_separate_forward_affine == "true": + print('component name=forward-affine{0}-xent type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. + format(l, args.jesus_forward_output_dim, args.final_hidden_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine-xent component=forward-affine{0}-xent input=post-jesus{0}'.format( + l), file=f) + print('component name=final-relu-xent type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( + args.final_hidden_dim, args.self_repair_scale), file=f) + print('component-node name=final-relu-xent component=final-relu-xent ' + 'input=jesus{0}-forward-output-affine-xent'.format(l), file=f) + xent_input = 'final-relu-xent' + + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + print('component name=final-affine-xent type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format( + cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f) + print('component-node name=final-affine-xent component=final-affine-xent input={0}'.format( + xent_input), file=f) + print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax-xent component=final-log-softmax-xent ' + 'input=final-affine-xent', file=f) + print('output-node name=output-xent input=final-log-softmax-xent', file=f) + + f.close() diff --git a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py index 57ed753c8c1..8403c273a9d 100644 --- a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py @@ -15,12 +15,20 @@ help="Raw feature dimension, e.g. 13") parser.add_argument("--ivector-dim", type=int, help="iVector dimension, e.g. 100", default=0) +parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) +parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) parser.add_argument("--pnorm-input-dim", type=int, help="input dimension to p-norm nonlinearities") parser.add_argument("--pnorm-output-dim", type=int, help="output dimension of p-norm nonlinearities") parser.add_argument("--relu-dim", type=int, help="dimension of ReLU nonlinearities") +parser.add_argument("--use-presoftmax-prior-scale", type=str, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = "true") parser.add_argument("--num-targets", type=int, help="number of network targets (e.g. num-pdf-ids/num-leaves)") parser.add_argument("config_dir", @@ -38,8 +46,8 @@ sys.exit("--splice-indexes argument is required"); if args.feat_dim is None or not (args.feat_dim > 0): sys.exit("--feat-dim argument is required"); -if args.num_targets is None or not (args.feat_dim > 0): - sys.exit("--feat-dim argument is required"); +if args.num_targets is None or not (args.num_targets > 0): + sys.exit("--num-targets argument is required"); if not args.relu_dim is None: if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: sys.exit("--relu-dim argument not compatible with " @@ -53,12 +61,16 @@ nonlin_input_dim = args.pnorm_input_dim nonlin_output_dim = args.pnorm_output_dim +if args.use_presoftmax_prior_scale == "true": + use_presoftmax_prior_scale = True +else: + use_presoftmax_prior_scale = False ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ] splice_array = [] left_context = 0 right_context = 0 -split1 = args.splice_indexes.split(" "); # we already checked the string is nonempty. +split1 = args.splice_indexes.split(); # we already checked the string is nonempty. 
if len(split1) < 1: sys.exit("invalid --splice-indexes argument, too short: " + args.splice_indexes) @@ -125,19 +137,22 @@ print('# In nnet3 framework, p in P-norm is always 2.', file=f) print('component name=nonlin{0} type=PnormComponent input-dim={1} output-dim={2}'. format(l, args.pnorm_input_dim, args.pnorm_output_dim), file=f) - print('component name=renorm{0} type=NormalizeComponent dim={1}'.format( - l, nonlin_output_dim), file=f) + print('component name=renorm{0} type=NormalizeComponent dim={1} target-rms={2}'.format( + l, nonlin_output_dim, + (1.0 if l < num_hidden_layers else args.final_layer_normalize_target)), file=f) print('component name=final-affine type=NaturalGradientAffineComponent ' 'input-dim={0} output-dim={1} param-stddev=0 bias-stddev=0'.format( nonlin_output_dim, args.num_targets), file=f) # printing out the next two, and their component-nodes, for l > 1 is not # really necessary as they will already exist, but it doesn't hurt and makes # the structure clearer. - print('component name=final-fixed-scale type=FixedScaleComponent ' - 'scales={0}/presoftmax_prior_scale.vec'.format( - args.config_dir), file=f) - print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( - args.num_targets), file=f) + if args.include_log_softmax == "true": + if use_presoftmax_prior_scale : + print('component name=final-fixed-scale type=FixedScaleComponent ' + 'scales={0}/presoftmax_prior_scale.vec'.format( + args.config_dir), file=f) + print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) print('# Now for the network structure', file=f) if l == 1: splices = [ ('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_array[l-1] ] @@ -161,11 +176,19 @@ print('component-node name=final-affine component=final-affine input=renorm{0}'. format(l), file=f) - print('component-node name=final-fixed-scale component=final-fixed-scale input=final-affine', - file=f) - print('component-node name=final-log-softmax component=final-log-softmax ' - 'input=final-fixed-scale', file=f) - print('output-node name=output input=final-log-softmax', file=f) + + if args.include_log_softmax == "true": + if use_presoftmax_prior_scale: + print('component-node name=final-fixed-scale component=final-fixed-scale input=final-affine', + file=f) + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-fixed-scale', file=f) + else: + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-affine', file=f) + print('output-node name=output input=final-log-softmax', file=f) + else: + print('output-node name=output input=final-affine', file=f) f.close() # component name=nonlin1 type=PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim diff --git a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index 24666b8bd02..c36de8c16bf 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -1,11 +1,12 @@ #!/bin/bash # script showing use of nnet3_to_dot.py -# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). +# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). # Begin configuration section. component_attributes="name,type" node_prefixes="" +info_bin=nnet3-am-info echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. @@ -20,7 +21,7 @@ if [ $# != 3 ]; then echo " --node-prefixes # list of prefixes. 
Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" - + exit 1; fi @@ -29,10 +30,10 @@ dot_file=$2 output_file=$3 attr=${node_prefixes:+ --node-prefixes "$node_prefixes"} -nnet3-am-info $model | \ +$info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ - $attr > $dot_file + $attr $dot_file command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } -dot -Tpng $dot_file -o $output_file +dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py new file mode 100644 index 00000000000..cbe2245652b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -0,0 +1,702 @@ +import subprocess +import logging +import math +import re +import time +import argparse +import datetime as dt + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) + pass + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. """ + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +def CheckIfCudaCompiled(): + p = subprocess.Popen("cuda-compiled") + p.communicate() + if p.returncode == 1: + return False + else: + return True + +def RunKaldiCommand(command, wait = True, measure_time = False): + """ Runs commands frequently seen in Kaldi scripts. 
These are usually a + sequence of commands connected by pipes, so we use shell=True """ + #logger.info("Running the command\n{0}".format(command)) + start_time = dt.datetime.now() + p = subprocess.Popen(command, shell = True, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE) + end_time = dt.datetime.now() + if measure_time: + duration = end_time - start_time + logger.info("Ran for {0} seconds".format(duration.seconds)) + if wait: + [stdout, stderr] = p.communicate() + if p.returncode is not 0: + raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr) + return stdout, stderr + else: + return p + +def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): + assert(num_models > 0) + + parse_regex = re.compile("LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames") + objf = [] + for i in range(num_models): + model_num = i + 1 + logfile = re.sub('%', str(model_num), log_file_pattern) + lines = open(logfile, 'r').readlines() + this_objf = -100000 + for line_num in range(1, len(lines) + 1): + # we search from the end as this would result in + # lesser number of regex searches. Python regex is slow ! + mat_obj = parse_regex.search(lines[-1*line_num]) + if mat_obj is not None: + this_objf = float(mat_obj.groups()[0]) + break; + objf.append(this_objf); + max_index = objf.index(max(objf)) + accepted_models = [] + for i in range(num_models): + if (objf[max_index] - objf[i]) <= difference_threshold: + accepted_models.append(i+1) + + if len(accepted_models) != num_models: + logger.warn("Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), num_models, log_file_pattern)) + + return [accepted_models, max_index+1] + +def GetNumberOfLeaves(alidir): + [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) + parts = stdout.split() + assert(parts[0] == "num-pdfs") + num_leaves = int(parts[1]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + +def GetNumberOfJobs(alidir): + try: + num_jobs = int(open('{0}/num_jobs'.format(alidir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the number of alignment jobs') + return num_jobs +def GetIvectorDim(ivector_dir = None): + if ivector_dir is None: + return 0 + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{dir}/ivector_online.scp -".format(dir = ivector_dir)) + ivector_dim = int(stdout_val) + return ivector_dim + +def GetFeatDim(feat_dir): + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{data}/feats.scp -".format(data = feat_dir)) + feat_dim = int(stdout_val) + return feat_dim + +def ReadKaldiMatrix(matrix_file): + try: + lines = map(lambda x: x.split(), open(matrix_file).readlines()) + first_field = lines[0][0] + last_field = lines[-1][-1] + lines[0] = lines[0][1:] + lines[-1] = lines[-1][:-1] + if not (first_field == "[" and last_field == "]"): + raise Exception("Kaldi matrix file has incorrect format, only text format matrix files can be read by this script") + for i in range(len(lines)): + lines[i] = map(lambda x: int(float(x)), lines[i]) + return lines + except IOError: + raise Exception("Error while reading the kaldi matrix file {0}".format(matrix_file)) + +def WriteKaldiMatrix(output_file, matrix): + # matrix is a list of lists + file = open(output_file, 'w') + file.write("[ ") + 
num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to have the same length") + file.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file.write("\n") + file.write(" ]") + file.close() + +import shutil +def CopyEgsPropertiesToExpDir(egs_dir, dir): + try: + for file in ['cmvn_opts', 'splice_opts', 'final.mat']: + file_name = '{dir}/{file}'.format(dir = egs_dir, file = file) + if os.path.isfile(file_name): + shutil.copy2(file_name, dir) + except IOError: + raise Exception("Error while trying to copy egs property files to {dir}".format(dir = dir)) + +def SplitData(data, num_jobs): + RunKaldiCommand("utils/split_data.sh {data} {num_jobs}".format(data = data, + num_jobs = num_jobs)) + +def ParseModelConfigVarsFile(var_file): + try: + var_file_handle = open(var_file, 'r') + model_left_context = None + model_right_context = None + num_hidden_layers = None + for line in var_file_handle: + parts = line.split('=') + field_name = parts[0].strip() + field_value = parts[1] + if field_name in ['model_left_context', 'left_context']: + model_left_context = int(field_value) + elif field_name in ['model_right_context', 'right_context']: + model_right_context = int(field_value) + elif field_name == 'num_hidden_layers': + num_hidden_layers = int(field_value) + + if model_left_context is not None and model_right_context is not None and num_hidden_layers is not None: + return [model_left_context, model_right_context, num_hidden_layers] + + except ValueError: + # we will throw an error at the end of the function so I will just pass + pass + + raise Exception('Error while parsing the file {0}'.format(var_file)) + + +def GenerateEgs(data, alidir, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + samples_per_iter = 20000, frames_per_eg = 20, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + RunKaldiCommand(""" +steps/nnet3/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + {data} {alidir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, data = data, alidir = alidir, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): + try: + egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) + egs_ivector_dim = int(open('{0}/info/ivector_dim'.format(egs_dir)).readline()) + egs_left_context = 
int(open('{0}/info/left_context'.format(egs_dir)).readline()) + egs_right_context = int(open('{0}/info/right_context'.format(egs_dir)).readline()) + if (feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): + raise Exception('There is mismatch between featdim/ivector_dim of the current experiment and the provided egs directory') + + if (egs_left_context < left_context) or (egs_right_context < right_context): + raise Exception("""The egs have insufficient context. +Required left context is {left_req_ctx} and available context is {left_av_ctx}. +Required right context is {right_req_ctx} and available context is {right_av_ctx}. +""".format(left_req_ctx = left_context, left_av_ctx = egs_left_context, + right_req_ctx = right_context, right_av_ctx = right_context)) + + frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline()) + num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline()) + + return [egs_left_context, egs_right_context, frames_per_eg, num_archives] + except IOError, ValueError: + raise Exception('The egs dir {0} has missing or malformed files'.format(egs_dir)) + +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs = None, rand_prune = 4.0, + lda_opts = None): + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" {dir}/JOB.lda_stats""".format( + command = run_opts.command, + num_lda_jobs = num_lda_jobs, + dir = dir, + egs_dir = egs_dir, + rand_prune = rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command = run_opts.command, + dir = dir, lda_stat_files = " ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
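+    # The resulting transform is written to {dir}/lda.mat and symlinked into
+    # {dir}/configs/lda.mat below.  Illustrative call (argument values are
+    # only an example, not taken from a real run):
+    #   ComputePreconditioningMatrix('exp/nnet3/tdnn', 'exp/nnet3/tdnn/egs',
+    #                                num_lda_jobs = 10, run_opts = run_opts,
+    #                                max_lda_jobs = 10, rand_prune = 4.0)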
+ + RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +import os, errno + +def ForceSymlink(file1, file2): + try: + os.symlink(file1, file2) + except OSError, e: + if e.errno == errno.EEXIST: + os.remove(file2) + os.symlink(file1, file2) + +def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, + presoftmax_prior_scale_power = None): + + # getting the raw pdf count + RunKaldiCommand(""" +{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ +ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ +post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- {dir}/pdf_counts.JOB + """.format(command = run_opts.command, + num_jobs = num_jobs, + dir = dir, + alidir = alidir)) + + RunKaldiCommand(""" +{command} {dir}/log/sum_pdf_counts.log \ +vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts + """.format(command = run_opts.command, dir = dir)) + + import glob + for file in glob.glob('{0}/pdf_counts.*'.format(dir)): + os.remove(file) + + smooth=0.01 + pdf_counts = ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] + total = sum(pdf_counts) + average_count = total/len(pdf_counts) + scales = [] + for i in range(len(pdf_counts)): + scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) + num_pdfs = len(pdf_counts) + scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + + output_file = "{0}/presoftmax_prior_scale.vec".format(dir) + WriteKaldiMatrix(output_file, [scaled_counts]) + ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) + +def PrepareInitialAcousticModel(dir, alidir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + + # Convert to .mdl, train the transitions, set the priors. + RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command = run_opts.command, + dir = dir, alidir = alidir)) + +def VerifyIterations(num_iters, num_epochs, num_hidden_layers, + num_archives, max_models_combine, add_layers_period, + num_jobs_final): + """ Verifies that number of iterations are sufficient for various + phases of training.""" + + finish_add_layers_iter = num_hidden_layers * add_layers_period + + if num_iters <= (finish_add_layers_iter + 2): + raise Exception(' There are insufficient number of epochs. These are not even sufficient for layer-wise discriminatory training.') + + + approx_iters_per_epoch_final = num_archives/num_jobs_final + # First work out how many iterations we want to combine over in the final + # nnet3-combine-fast invocation. (We may end up subsampling from these if the + # number exceeds max_model_combine). 
The number we use is: + # min(max(max_models_combine, approx_iters_per_epoch_final), + # 1/2 * iters_after_last_layer_added) + half_iters_after_add_layers = (num_iters - finish_add_layers_iter)/2 + num_iters_combine = min(max(max_models_combine, approx_iters_per_epoch_final), half_iters_after_add_layers) + return num_iters_combine + +def GetRealignIters(realign_times, num_iters, + num_jobs_initial, num_jobs_final): + """ Takes the realign_times string and identifies the approximate + iterations at which realignments have to be done.""" + # realign_times is a space seperated string of values between 0 and 1 + + realign_iters = [] + for realign_time in realign_times.split(): + realign_time = float(realign_time) + assert(realign_time > 0 and realign_time < 1) + if num_jobs_initial == num_jobs_final: + realign_iter = int(0.5 + num_iters * realign_time) + else: + realign_iter = math.sqrt((1 - realign_time) * math.pow(num_jobs_initial, 2) + + realign_time * math.pow(num_jobs_final, 2)) + realign_iter = realign_iter - num_jobs_initial + realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter * num_iters + realign_iters.append(int(realign_iter)) + + return realign_iters + +def Align(dir, data, lang, run_opts, iter = None, transform_dir = None, + online_ivector_dir = None): + + alidir = '{dir}/ali{ali_suffix}'.format(dir = dir, + ali_suffix = "_iter_{0}".format(iter) if iter is not None else "") + + logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( + gpu = " using gpu " if run_opts.realign_use_gpu else " ", + num_jobs = run_opts.realign_num_jobs )) + RunKaldiCommand(""" +steps/nnet3/align.sh --nj {num_jobs_align} --cmd "{align_cmd} {align_queue_opt}" \ + --use-gpu {align_use_gpu} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{online_ivector_dir}" \ + --iter "{iter}" {data} {lang} {dir} {alidir} + """.format(dir = dir, align_use_gpu = "yes" if run_opts.realign_use_gpu else "no", + align_cmd = run_opts.realign_command, + align_queue_opt = run_opts.realign_queue_opt, + num_jobs_align = run_opts.realign_num_jobs, + transform_dir = transform_dir if transform_dir is not None else "", + online_ivector_dir = online_ivector_dir if online_ivector_dir is not None else "", + iter = iter if iter is not None else "", + alidir = alidir, + lang = lang, data = data)) + return alidir + +def Realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, + prior_subset_size, num_archives, run_opts, + transform_dir = None, online_ivector_dir = None): + raise Exception("Realignment stage has not been implemented in nnet3") + logger.info("Getting average posterior for purposes of adjusting the priors.") + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. 
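+    # NOTE: everything below the raise above is unreachable until realignment
+    # is implemented; it only sketches the intended flow: average the output
+    # posteriors over a subset of egs, re-adjust the model priors, realign the
+    # data and relabel the existing egs.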
+ + avg_post_vec_file = ComputeAveragePosterior(dir, iter, prev_egs_dir, + num_archives, prior_subset_size, run_opts) + + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + logger.info("Re-adjusting priors based on computed posteriors") + model = '{0}/{1}.mdl'.format(dir, iter) + AdjustAmPriors(dir, model, avg_post_vec_file, model, run_opts) + + alidir = Align(dir, feat_dir, lang, run_opts, iter, + transform_dir, online_ivector_dir) + RunKaldiCommand(""" +steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} {alidir} \ + {prev_egs_dir} {cur_egs_dir}""".format( + command = run_opts.command, + iter = iter, + dir = dir, + alidir = alidir, + prev_egs_dir = prev_egs_dir, + cur_egs_dir = cur_egs_dir)) + +def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, + num_archives_to_process, + initial_effective_lrate, final_effective_lrate): + if iter + 1 >= num_iters: + effective_learning_rate = final_effective_lrate + else: + effective_learning_rate = initial_effective_lrate * math.exp(num_archives_processed * math.log(final_effective_lrate/ initial_effective_lrate)/num_archives_to_process) + + return num_jobs * effective_learning_rate + +def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): + + if iter == 0: + return True + + try: + output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) + output = output.strip().split("\n") + # eg. + # component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] + + mean_pattern = re.compile(".*deriv-avg=.*mean=([0-9\.]+).*") + total_mean_deriv = 0 + num_derivs = 0 + for line in output: + mat_obj = mean_pattern.search(line) + if mat_obj is None: + raise Exception("Something went wrong, unable to find deriv-avg in the line \n{0}".format(line)) + mean_deriv = float(mat_obj.groups()[0]) + total_mean_deriv += mean_deriv + num_derivs += 1 + if total_mean_deriv / num_derivs < shrink_threshold: + return True + except ValueError: + raise Exception("Error while parsing the model info output") + + return False + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, wait = False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + "ark,bg:nnet3-merge-egs ark:{egs_dir}/valid_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + egs_dir = egs_dir), wait = wait) + + RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + "ark,bg:nnet3-merge-egs ark:{egs_dir}/train_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + egs_dir = egs_dir), wait = wait) + + +def ComputeProgress(dir, iter, egs_dir, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ +nnet3-show-progress --use-gpu=no 
"nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ +"ark,bg:nnet3-merge-egs --minibatch-size=256 ark:{egs_dir}/train_diagnostic.egs ark:-|" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + prev_model = prev_model, + egs_dir = egs_dir), wait = wait) + +def CombineModels(dir, num_iters, num_iters_combine, egs_dir, + run_opts, chunk_width = None): + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + print num_iters_combine + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + + if chunk_width is not None: + # this is an RNN model + mbsize = int(1024.0/(chunk_width)) + else: + mbsize = 1024 + + RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ +"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + dir = dir, raw_models = " ".join(raw_model_strings), + mbsize = mbsize, + num_iters = num_iters, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + +def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, + prior_subset_size, run_opts): + # Note: this just uses CPUs, using a smallish subset of data. + """ Computes the average posterior of the network""" + import glob + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + + if run_opts.num_jobs_compute_prior > num_archives: + egs_part = 1 + else: + egs_part = 'JOB' + + RunKaldiCommand(""" +{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ + nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ + "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ +matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + """.format(command = run_opts.command, + dir = dir, + num_jobs_compute_prior = run_opts.num_jobs_compute_prior, + prior_queue_opt = run_opts.prior_queue_opt, + iter = iter, prior_subset_size = prior_subset_size, + egs_dir = egs_dir, egs_part = egs_part, + prior_gpu_opt = run_opts.prior_gpu_opt)) + + # make sure there is time for $dir/post.{iter}.*.vec to appear. 
+ time.sleep(5) + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + RunKaldiCommand(""" +{command} {dir}/log/vector_sum.{iter}.log \ + vector-sum {dir}/post.{iter}.*.vec {output_file} + """.format(command = run_opts.command, + dir = dir, iter = iter, output_file = avg_post_vec_file)) + + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + return avg_post_vec_file + +def AdjustAmPriors(dir, input_model, avg_posterior_vector, output_model, run_opts): + RunKaldiCommand(""" +{command} {dir}/log/adjust_priors.final.log \ +nnet3-am-adjust-priors {input_model} {avg_posterior_vector} {output_model} + """.format(command = run_opts.command, + dir = dir, input_model = input_model, + avg_posterior_vector = avg_posterior_vector, + output_model = output_model)) + +def RemoveEgs(egs_dir): + RunKaldiCommand("steps/nnet2/remove_egs.sh {egs_dir}".format(egs_dir=egs_dir)) + +def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, + preserve_model_interval = 100, + remove_egs = True): + try: + if remove_egs: + RemoveEgs(egs_dir) + + for iter in range(num_iters): + RemoveModel(nnet_dir, iter, num_iters, 1, + preserve_model_interval) + except (IOError, OSError) as err: + logger.warning("Error while cleaning up the nnet directory") + raise err + +def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, + preserve_model_interval = 100): + if iter % preserve_model_interval == 0: + return + if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : + return + file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + if os.path.isfile(file_name): + os.remove(file_name) + +def ComputeLifterCoeffs(lifter, dim): + coeffs = [0] * dim + for i in range(0, dim): + coeffs[i] = 1.0 + 0.5 * lifter * math.sin(math.pi * i / float(lifter)); + + return coeffs + +def ComputeIdctMatrix(K, N, cepstral_lifter=0): + matrix = [[0] * K for i in range(N)] + # normalizer for X_0 + normalizer = math.sqrt(1.0 / float(N)); + for j in range(0, N): + matrix[j][0] = normalizer; + # normalizer for other elements + normalizer = math.sqrt(2.0 / float(N)); + for k in range(1, K): + for n in range(0, N): + matrix[n][k] = normalizer * math.cos(math.pi / float(N) * (n + 0.5) * k); + + if cepstral_lifter != 0: + lifter_coeffs = ComputeLifterCoeffs(cepstral_lifter, K) + for k in range(0, K): + for n in range(0, N): + matrix[n][k] = matrix[n][k] / lifter_coeffs[k]; + + return matrix + +def WriteIdctMatrix(feat_dim, cepstral_lifter, file_path): + # generate the IDCT matrix and write to the file + idct_matrix = ComputeIdctMatrix(feat_dim, feat_dim, cepstral_lifter) + # append a zero column to the matrix, this is the bias of the fixed affine component + for k in range(0, feat_dim): + idct_matrix[k].append(0) + WriteKaldiMatrix(file_path, idct_matrix) + diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py new file mode 100755 index 00000000000..bed8abd132b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. 
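+# This script parses nnet3 training logs and writes accuracy (or
+# log-probability for chain models), log-likelihood, non-linearity-statistics
+# and parameter-difference reports, and, when matplotlib is available, a
+# pdflatex-compiled PDF report.
+# Example invocation (paths are illustrative):
+#   steps/nnet3/report/generate_plots.py --comparison-dir exp/nnet3/tdnn2 \
+#       exp/nnet3/tdnn1 exp/nnet3/tdnn1/report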
+ +import warnings +import imp +import argparse +import os +import errno +import logging +import re +import subprocess +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +try: + import matplotlib as mpl + mpl.use('Agg') + import matplotlib.pyplot as plt + from matplotlib.backends.backend_pdf import PdfPages + import numpy as np + + plot = True +except ImportError: + warnings.warn(""" +This script requires matplotlib and numpy. Please install them to generate plots. Proceeding with generation of tables. +If you are on a cluster where you do not have admin rights you could try using virtualenv.""") + +nlp = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Generating plots') + + + + +def GetArgs(): + parser = argparse.ArgumentParser(description=""" +Parses the training logs and generates a variety of plots. +example : steps/nnet3/report/generate_plots.py --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2 exp/nnet3/tdnn exp/nnet3/tdnn/report +""") + parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison. These will only be used for plots, not tables") + parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1) + parser.add_argument("--is-chain", type=str, default = False, action = train_lib.StrToBoolAction, help="Iteration from which plotting will start") + parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn") + parser.add_argument("output_dir", help="experiment directory, e.g. exp/nnet3/tdnn/report") + + args = parser.parse_args() + if args.comparison_dir is not None and len(args.comparison_dir) > 6: + raise Exception("max 6 --comparison-dir options can be specified. If you want to compare with more comparison_dir, you would have to carefully tune the plot_colors variable which specified colors used for plotting.") + assert(args.start_iter >= 1) + return args + +plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan' ] + + + +class LatexReport: + def __init__(self, pdf_file): + self.pdf_file = pdf_file + self.document=[] + self.document.append(""" +\documentclass[prl,10pt,twocolumn]{revtex4} +\usepackage{graphicx} % Used to import the graphics +\\begin{document} +""") + + def AddFigure(self, figure_pdf, title): + # we will have keep extending this replacement list based on errors during compilation + # escaping underscores in the title + title = "\\texttt{"+re.sub("_","\_", title)+"}" + fig_latex = """ +%... +\\newpage +\\begin{figure}[h] + \\begin{center} + \caption{""" + title + """} + \includegraphics[width=\\textwidth]{""" + figure_pdf + """} + \end{center} +\end{figure} +\clearpage +%... 
+""" + self.document.append(fig_latex) + + def Close(self): + self.document.append("\end{document}") + return self.Compile() + + def Compile(self): + root, ext = os.path.splitext(self.pdf_file) + dir_name = os.path.dirname(self.pdf_file) + latex_file = root + ".tex" + lat_file = open(latex_file, "w") + lat_file.write("\n".join(self.document)) + lat_file.close() + logger.info("Compiling the latex report.") + try: + proc = subprocess.Popen(['pdflatex', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc.communicate() + except Exception as e: + logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) + return False + return True + +def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): + assert(start_iter >= 1) + + if plot: + fig = plt.figure() + plots = [] + + comparison_dir = [] if comparison_dir is None else comparison_dir + dirs = [exp_dir] + comparison_dir + index = 0 + for dir in dirs: + [accuracy_report, accuracy_times, accuracy_data] = nlp.GenerateAccuracyReport(dir, key) + if index == 0: + # this is the main experiment directory + acc_file = open("{0}/{1}.log".format(output_dir, file_basename), "w") + acc_file.write(accuracy_report) + acc_file.close() + + if plot: + color_val = plot_colors[index] + data = np.array(accuracy_data) + if data.shape[0] == 0: + raise Exception("Couldn't find any rows for the accuracy plot") + data = data[data[:,0]>=start_iter, :] + plot_handle, = plt.plot(data[:, 0], data[:, 1], color = color_val, linestyle = "--", label = "train {0}".format(dir)) + plots.append(plot_handle) + plot_handle, = plt.plot(data[:, 0], data[:, 2], color = color_val, label = "valid {0}".format(dir)) + plots.append(plot_handle) + index += 1 + if plot: + plt.xlabel('Iteration') + plt.ylabel(key) + lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.2 + len(dirs) * -0.1 ), ncol=1, borderaxespad=0.) 
+ plt.grid(True) + fig.suptitle("{0} plot".format(key)) + figfile_name = '{0}/{1}.pdf'.format(output_dir, file_basename) + plt.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + if latex_report is not None: + latex_report.AddFigure(figfile_name, "Plot of {0} vs iterations".format(key)) + +def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, start_iter = 1, latex_report = None): + assert(start_iter >= 1) + + comparison_dir = [] if comparison_dir is None else comparison_dir + dirs = [exp_dir] + comparison_dir + index = 0 + stats_per_dir = {} + + for dir in dirs: + stats_per_component_per_iter = nlp.ParseProgressLogsForNonlinearityStats(dir) + stats_per_dir[dir] = stats_per_component_per_iter + + # convert the nonlin stats into tables + stat_tables_per_component_per_dir = {} + for dir in dirs: + stats_per_component_per_iter = stats_per_dir[dir] + component_names = stats_per_component_per_iter.keys() + stat_tables_per_component = {} + for component_name in component_names: + comp_data = stats_per_component_per_iter[component_name] + comp_type = comp_data['type'] + comp_stats = comp_data['stats'] + iters = comp_stats.keys() + iters.sort() + iter_stats = [] + for iter in iters: + iter_stats.append([iter] + comp_stats[iter]) + stat_tables_per_component[component_name] = iter_stats + stat_tables_per_component_per_dir[dir] = stat_tables_per_component + + main_stat_tables = stat_tables_per_component_per_dir[exp_dir] + for component_name in main_stat_tables.keys(): + # this is the main experiment directory + file = open("{dir}/nonlinstats_{comp_name}.log".format(dir = output_dir, comp_name = component_name), "w") + file.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\n") + iter_stat_report = "" + iter_stats = main_stat_tables[component_name] + for row in iter_stats: + iter_stat_report += "\t".join(map(lambda x: str(x), row)) + "\n" + file.write(iter_stat_report) + file.close() + + if plot: + main_component_names = main_stat_tables.keys() + main_component_names.sort() + + plot_component_names = set(main_component_names) + for dir in dirs: + component_names = set(stats_per_dir[dir].keys()) + plot_component_names = plot_component_names.intersection(component_names) + plot_component_names = list(plot_component_names) + plot_component_names.sort() + if plot_component_names != main_component_names: + logger.warning("The components in all the neural networks in the given experiment dirs are not the same, so comparison plots are provided only for common component names. 
Make sure that these are comparable experiments before analyzing these plots.") + + fig = plt.figure() + for component_name in main_component_names: + fig.clf() + index = 0 + plots = [] + for dir in dirs: + color_val = plot_colors[index] + index += 1 + try: + iter_stats = stat_tables_per_component_per_dir[dir][component_name] + except KeyError: + # this component is not available in this network so lets not just plot it + continue + + data = np.array(iter_stats) + data = data[data[:,0] >=start_iter, :] + ax = plt.subplot(211) + mp, = ax.plot(data[:,0], data[:,1], color=color_val, label="Mean {0}".format(dir)) + msph, = ax.plot(data[:,0], data[:,1] + data[:,2], color=color_val, linestyle='--', label = "Mean+-Stddev {0}".format(dir)) + mspl, = ax.plot(data[:,0], data[:,1] - data[:,2], color=color_val, linestyle='--') + plots.append(mp) + plots.append(msph) + ax.set_ylabel('Value-{0}'.format(comp_type)) + ax.grid(True) + + ax = plt.subplot(212) + mp, = ax.plot(data[:,0], data[:,3], color=color_val) + msph, = ax.plot(data[:,0], data[:,3] + data[:,4], color=color_val, linestyle='--') + mspl, = ax.plot(data[:,0], data[:,3] - data[:,4], color=color_val, linestyle='--') + ax.set_xlabel('Iteration') + ax.set_ylabel('Derivative-{0}'.format(comp_type)) + ax.grid(True) + + lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) + plt.grid(True) + fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + if latex_report is not None: + latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) + +def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False): + try: + os.makedirs(output_dir) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(output_dir): + pass + else: + raise e + if plot: + latex_report = LatexReport("{0}/report.pdf".format(output_dir)) + else: + latex_report = None + + if is_chain: + logger.info("Generating log-probability plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + else: + logger.info("Generating accuracy plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating log-likelihood plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating non-linearity stats plots") + GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating parameter difference files") + # Parameter changes + key_file = {"Parameter differences":"parameter.diff", + "Relative parameter differences":"relative_parameter.diff"} + for key in key_file.keys(): + file = open("{0}/{1}".format(output_dir, key_file[key]), "w") + data = nlp.ParseProgressLogsForParamDiff(exp_dir, key) + for row in data: + file.write(" 
".join(map(lambda x:str(x),row))+"\n") + file.close() + if plot and latex_report is not None: + has_compiled = latex_report.Close() + if has_compiled: + logger.info("Report has been generated. You can find it at the location {0}".format("{0}/report.pdf".format(output_dir))) + +def Main(): + args = GetArgs() + GeneratePlots(args.exp_dir, args.output_dir, + comparison_dir = args.comparison_dir, + start_iter = args.start_iter, + is_chain = args.is_chain) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py new file mode 100755 index 00000000000..2268fbadd72 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py @@ -0,0 +1,154 @@ +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + +from __future__ import division +import sys, glob, re, math, datetime, argparse +import imp +import datetime as dt + +ntl = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +#exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83 0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18 0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397] +def ParseProgressLogsForNonlinearityStats(exp_dir): + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + stats_per_component_per_iter = {} + progress_log_lines = ntl.RunKaldiCommand('grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files), measure_time = False)[0] + parse_regex = re.compile(".*progress.([0-9]+).log:component name=(.+) type=(.*)Component,.*value-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*deriv-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]") + for line in progress_log_lines.split("\n") : + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.502', '0.23', '0.134', '0.0397') + iteration = int(groups[0]) + component_name = groups[1] + component_type = groups[2] + value_mean = float(groups[3]) + value_stddev = float(groups[4]) + deriv_mean = float(groups[5]) + deriv_stddev = float(groups[6]) + try: + stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] + except KeyError: + stats_per_component_per_iter[component_name] = {} + stats_per_component_per_iter[component_name]['type'] = component_type + stats_per_component_per_iter[component_name]['stats'] = {} + stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] + + return stats_per_component_per_iter + +def ParseDifferenceString(string): + dict = {} + for parts in string.split(): + sub_parts = parts.split(":") + dict[sub_parts[0]] = float(sub_parts[1]) + return dict + +#exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter differences per layer are [ Cwrnn1_T3_W_r:0.0171537 Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07 Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521 Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978 Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588 
Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 Cwrnn3_T2_W_r:0.0239754 Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ] +def ParseProgressLogsForParamDiff(exp_dir, pattern): + if pattern not in set(["Relative parameter differences", "Parameter differences"]): + raise Exception("Unknown value for pattern : {0}".format(pattern)) + + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + progress_per_iter = {} + component_names = set([]) + progress_log_lines = ntl.RunKaldiCommand('grep -e "{0}" {1}'.format(pattern, progress_log_files), measure_time = False)[0] + parse_regex = re.compile(".*progress\.([0-9]+)\.log:LOG.*{0}.*\[(.*)\]".format(pattern)) + for line in progress_log_lines.split("\n") : + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + iteration = groups[0] + differences = ParseDifferenceString(groups[1]) + component_names = component_names.union(differences.keys()) + progress_per_iter[int(iteration)] = differences + + component_names = list(component_names) + component_names.sort() + # rearranging the data into an array + data = [] + data.append(["iteration"]+component_names) + max_iter = max(progress_per_iter.keys()) + for iter in range(max_iter + 1): + try: + component_dict = progress_per_iter[iter] + except KeyError: + continue + iter_values = [] + for component_name in component_names: + try: + iter_values.append(component_dict[component_name]) + except KeyError: + # the component was not found this iteration, may be because of layerwise discriminative training + iter_values.append(0) + data.append([iter] + iter_values) + + return data + +def ParseTrainLogs(exp_dir): + train_log_files = "%s/log/train.*.log" % (exp_dir) + train_log_lines = ntl.RunKaldiCommand('grep -e Accounting {0}'.format(train_log_files), measure_time = False)[0] + parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# Accounting: time=([0-9]+) thread.*") + + train_times = {} + for line in train_log_lines.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + try: + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + except KeyError: + train_times[int(groups[0])] = {} + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + iters = train_times.keys() + for iter in iters: + values = train_times[iter].values() + train_times[iter] = max(values) + return train_times + +def ParseProbLogs(exp_dir, key = 'accuracy'): + train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) + valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) + train_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, train_prob_files), wait = True, measure_time = False)[0] + valid_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, valid_prob_files), measure_time = False)[0] + + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832 per frame, over 20000 fra + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144) Overall log-probability for 'output' is -0.307255 per frame, over 20000 frames. + parse_regex = re.compile(".*compute_prob_.*\.([0-9]+).log:LOG .nnet3.*compute-prob:PrintTotalStats..:nnet.*diagnostics.cc:[0-9]+. 
Overall ([a-zA-Z\-]+) for 'output'.*is ([0-9.\-e]+) .*per frame") + train_loss={} + valid_loss={} + + + for line in train_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + train_loss[int(groups[0])] = groups[2] + for line in valid_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + valid_loss[int(groups[0])] = groups[2] + iters = list(set(valid_loss.keys()).intersection(train_loss.keys())) + iters.sort() + return map(lambda x: (int(x), float(train_loss[x]), float(valid_loss[x])), iters) + +def GenerateAccuracyReport(exp_dir, key = "accuracy"): + times = ParseTrainLogs(exp_dir) + data = ParseProbLogs(exp_dir, key) + report = [] + report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") + for x in data: + try: + report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) + except KeyError: + continue + + total_time = 0 + for iter in times.keys(): + total_time += times[iter] + report.append("Total training time is {0}\n".format(str(datetime.timedelta(seconds = total_time)))) + return ["\n".join(report), times, data] diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py new file mode 100755 index 00000000000..5290a4c1abe --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -0,0 +1,638 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import shlex +import sys +import warnings +import copy +import imp +import ast + +nodes = imp.load_source('', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/tdnn/train.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + + # CNN options + parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer", + help="CNN parameters at each CNN layer, e.g. 
--filt-x-dim=3 --filt-y-dim=8 " + "--filt-x-step=1 --filt-y-step=1 --num-filters=256 --pool-x-size=1 --pool-y-size=3 " + "--pool-z-size=1 --pool-x-step=1 --pool-y-step=3 --pool-z-step=1, " + "when CNN layers are used, no LDA will be added", default = None) + parser.add_argument("--cnn.bottleneck-dim", type=int, dest = "cnn_bottleneck_dim", + help="Output dimension of the linear layer at the CNN output " + "for dimension reduction, e.g. 256." + "The default zero means this layer is not needed.", default=0) + parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) + + # General neural network options + parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice indexes at each layer, e.g. '-3,-2,-1,0,1,2,3' " + "If CNN layers are used the first set of splice indexes will be used as input " + "to the first CNN layer and later splice indexes will be interpreted as indexes " + "for the TDNNs.") + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If \"true\" an LDA matrix computed from the input features " + "(spliced according to the first set of splice-indexes) will be used as " + "the first Affine layer. This affine layer's parameters are fixed during training. " + "If --cnn.layer is specified this option will be forced to \"false\".", + default=True, choices = ["false", "true"]) + + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a final sigmoid layer as alternate to log-softmax-layer. " + "Can only be used if include-log-softmax is false. " + "This is useful in cases where you want the output to be " + "like probabilities between 0 and 1. Typically the nnet " + "is trained with an objective such as quadratic", + default=False, choices = ["false", "true"]) + + parser.add_argument("--objective-type", type=str, + help = "the type of objective; i.e. 
quadratic or linear", + default="linear", choices = ["linear", "quadratic"]) + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + + parser.add_argument("--self-repair-scale", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + + parser.add_argument("--pool-type", type=str, default = 'none', + help="Type of pooling to be used.", choices = ['low-pass', 'weighted-average', 'per-dim-weighted-average', 'multi-dim-weighted-average', 'none']) + parser.add_argument("--pool-window", type=int, default = None, + help="Width of the pooling window") + parser.add_argument("--pool-lpfilter-width", type=float, + default = None, help="Nyquist frequency of the lpfilter to be used for pooling") + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = True) + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.subset_dim < 0): + raise Exception("--subset-dim has to be non-negative") + if (args.pool_window is not None) and (args.pool_window <= 0): + raise Exception("--pool-window has to be positive") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + raise Exception("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + args.nonlin_input_dim = args.relu_dim + args.nonlin_output_dim = args.relu_dim + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + raise Exception("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + args.nonlin_input_dim = args.pnorm_input_dim + args.nonlin_output_dim = args.pnorm_output_dim + + if args.add_final_sigmoid and args.include_log_softmax: + raise Exception("--include-log-softmax and --add-final-sigmoid cannot both be true.") + + if args.xent_separate_forward_affine and args.add_final_sigmoid: + raise Exception("It does not make sense to have --add-final-sigmoid=true when xent-separate-forward-affine is true") + + if args.add_lda and args.cnn_layer is not None: + args.add_lda = False + warnings.warn("--add-lda is set to false as CNN layers are used.") + + return args + + +def AddMultiDimAffineLayer(config_lines, name, input, input_window, block_input_dim, block_output_dim): + assert(block_input_dim % input_window== 0) + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats / (block_input_dim / input_window) * block_output_dim, num_feats / (block_input_dim/ input_window)) + + return [output_descriptor, filter_context, filter_context] + +def AddMultiDimAffineLayer(config_lines, name, input, input_window, 
block_input_dim, block_output_dim): + assert(block_input_dim % input_window== 0) + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats / (block_input_dim / input_window) * block_output_dim, num_feats / (block_input_dim/ input_window)) + + return [output_descriptor, filter_context, filter_context] + +def AddLpFilter(config_lines, name, input, rate, num_lpfilter_taps, lpfilt_filename, is_updatable = False): + try: + import scipy.signal as signal + import numpy as np + except ImportError: + raise Exception(" This recipe cannot be run without scipy." + " You can install it using the command \n" + " pip install scipy\n" + " If you do not have admin access on the machine you are" + " trying to run this recipe, you can try using" + " virtualenv") + # low-pass smoothing of input was specified. 
so we will add a low-pass filtering layer + lp_filter = signal.firwin(num_lpfilter_taps, rate, width=None, window='hamming', pass_zero=True, scale=True, nyq=1.0) + lp_filter = list(np.append(lp_filter, 0)) + nnet3_train_lib.WriteKaldiMatrix(lpfilt_filename, [lp_filter]) + filter_context = int((num_lpfilter_taps - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + input_x_dim = len(filter_input_splice_indexes) + input_y_dim = input['dimension'] + input_z_dim = 1 + filt_x_dim = len(filter_input_splice_indexes) + filt_y_dim = 1 + filt_x_step = 1 + filt_y_step = 1 + input_vectorization = 'zyx' + + tdnn_input_descriptor = nodes.AddConvolutionLayer(config_lines, name, + filter_input_descriptor, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + 1, input_vectorization, + filter_bias_file = lpfilt_filename, + is_updatable = is_updatable) + + + return [tdnn_input_descriptor, filter_context, filter_context] + +def AddConvMaxpLayer(config_lines, name, input, args): + if '3d-dim' not in input: + raise Exception("The input to AddConvMaxpLayer() needs '3d-dim' parameters.") + + input = nodes.AddConvolutionLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.filt_x_dim, args.filt_y_dim, + args.filt_x_step, args.filt_y_step, + args.num_filters, input['vectorization']) + + if args.pool_x_size > 1 or args.pool_y_size > 1 or args.pool_z_size > 1: + input = nodes.AddMaxpoolingLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.pool_x_size, args.pool_y_size, args.pool_z_size, + args.pool_x_step, args.pool_y_step, args.pool_z_step) + + return input + +# The ivectors are processed through an affine layer parallel to the CNN layers, +# then concatenated with the CNN output and passed to the deeper part of the network. 
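+# A rough illustration (made-up dimensions, not taken from any recipe): with a
+# CNN stack whose output dimension is 256 and ivector_dim=100, the layer
+# returned by AddCnnLayers() would look roughly like
+#     {'descriptor': 'Append(<cnn-or-bottleneck-output>, <ivector-affine-output>)',
+#      'dimension': 356}
+# i.e. the iVector stream bypasses the convolutional stack and is only joined
+# back in at this function's output.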
+def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): + cnn_args = ParseCnnString(cnn_layer) + num_cnn_layers = len(cnn_args) + # We use an Idct layer here to convert MFCC to FBANK features + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + prev_layer_output = {'descriptor': "input", + 'dimension': feat_dim} + prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') + + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] + splice_descriptor = "Append({0})".format(", ".join(list)) + cnn_input_dim = len(splice_indexes) * feat_dim + prev_layer_output = {'descriptor': splice_descriptor, + 'dimension': cnn_input_dim, + '3d-dim': [len(splice_indexes), feat_dim, 1], + 'vectorization': 'yzx'} + + for cl in range(0, num_cnn_layers): + prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + + if cnn_bottleneck_dim > 0: + prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + + if ivector_dim > 0: + iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', + 'dimension': ivector_dim} + iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) + prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] + + return prev_layer_output + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseCnnString(cnn_param_string_list): + cnn_parser = argparse.ArgumentParser(description="cnn argument parser") + + cnn_parser.add_argument("--filt-x-dim", required=True, type=int) + cnn_parser.add_argument("--filt-y-dim", required=True, type=int) + cnn_parser.add_argument("--filt-x-step", type=int, default = 1) + cnn_parser.add_argument("--filt-y-step", type=int, default = 1) + cnn_parser.add_argument("--num-filters", required=True, type=int) + cnn_parser.add_argument("--pool-x-size", type=int, default = 1) + cnn_parser.add_argument("--pool-y-size", type=int, default = 1) + cnn_parser.add_argument("--pool-z-size", type=int, default = 1) + cnn_parser.add_argument("--pool-x-step", type=int, default = 1) + cnn_parser.add_argument("--pool-y-step", type=int, default = 1) + cnn_parser.add_argument("--pool-z-step", type=int, default = 1) + + cnn_args = [] + for cl in range(0, len(cnn_param_string_list)): + cnn_args.append(cnn_parser.parse_args(shlex.split(cnn_param_string_list[cl]))) + + return cnn_args + +def ParseSpliceString(splice_indexes): + splice_array = [] + left_context = 0 + right_context = 0 + split1 = splice_indexes.split(); # we already checked the string is nonempty. 
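+    # Worked example (hypothetical input, for illustration only): for
+    # splice_indexes = "-4,-3,-2,-1,0,1,2,3,4 0 -2,2", split1 becomes
+    # ['-4,-3,-2,-1,0,1,2,3,4', '0', '-2,2'], giving num_hidden_layers = 3 with
+    # left_context = 4 + 0 + 2 = 6 and right_context = 4 + 0 + 2 = 6.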
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + raise Exception("invalid splice-indexes argument " + splice_indexes + str(e)) + left_context = max(0, left_context) + right_context = max(0, right_context) + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +# The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. +def MakeConfigs(config_dir, splice_indexes_string, + cnn_layer, cnn_bottleneck_dim, cepstral_lifter, + feat_dim, ivector_dim, num_targets, add_lda, + nonlin_input_dim, nonlin_output_dim, subset_dim, + pool_type, pool_window, pool_lpfilter_width, + use_presoftmax_prior_scale, + final_layer_normalize_target, + include_log_softmax, + add_final_sigmoid, + xent_regularize, + xent_separate_forward_affine, + self_repair_scale, + objective_type): + + parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) + + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim + + if xent_separate_forward_affine: + if splice_indexes[-1] != [0]: + raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.") + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines + + if cnn_layer is not None: + prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, + feat_dim, splice_indexes[0], ivector_dim) + + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + # if specified, pool the input from the previous layer + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + if pool_type != "none" and pool_window is None: + raise Exception("Pooling type was specified as {0}, this requires specification of the pool-window".format(pool_type)) + if pool_type in set(["low-pass", "weighted-average"]): + if pool_type == "weighted-average": + lpfilter_is_updatable = True + else: + lpfilter_is_updatable = False + # low-pass filter the input to smooth it before the sub-sampling + [prev_layer_output, cur_left_context, cur_right_context] = AddLpFilter(config_lines, + 'Tdnn_input_smoother_{0}'.format(i), + prev_layer_output, + pool_lpfilter_width, + pool_window, + config_dir + '/Tdnn_input_smoother_{0}.txt'.format(i), + is_updatable = lpfilter_is_updatable) + left_context += cur_left_context + right_context += cur_right_context + + elif pool_type == "per-dim-weighted-average": + # add permute component to shuffle the feature columns of the Append descriptor output so + # that columns corresponding to the same feature index are contiguous + # add a block-affine component to collapse all the feature indexes across time steps into a single value + [prev_layer_output, cur_left_context, cur_right_context] = nodes.AddPerDimAffineLayer(config_lines, + 'Tdnn_input_{0}'.format(i), + prev_layer_output, + pool_window) + + left_context += cur_left_context + right_context += cur_right_context + + elif pool_type == "multi-dim-weighted-average": + [prev_layer_output, cur_left_context, cur_right_context] = AddMultiDimAffineLayer(config_lines, + 'Tdnn_input_{0}'.format(i), + prev_layer_output, + pool_window, + 8 * pool_window, 8) + left_context += cur_left_context + right_context += cur_right_context + + + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + + if xent_separate_forward_affine and i == num_hidden_layers - 1: + if xent_regularize == 0.0: + raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") + + prev_layer_output_chain = 
nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + + nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + + prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + else: + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + + # add_final_sigmoid adds a sigmoid as a final layer as alternative + # to log-softmax layer. + # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers + # This is useful when you need the final outputs to be probabilities between 0 and 1. + # Usually used with an objective-type such as "quadratic". + # Applications are k-binary classification such Ideal Ratio Mask prediction. 
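+            # For instance (hypothetical flags, not a tested configuration):
+            #   --include-log-softmax false --add-final-sigmoid true --objective-type quadratic
+            # would make the network regress per-dimension targets in [0, 1],
+            # such as ideal-ratio-mask values, instead of producing a
+            # distribution over senones.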
+ nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('num_targets=' + str(num_targets), file=f) + print('add_lda=' + ('true' if add_lda else 'false'), file=f) + print('include_log_softmax=' + ('true' if include_log_softmax else 'false'), file=f) + print('objective_type=' + objective_type, file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) + +def Main(): + args = GetArgs() + + MakeConfigs(config_dir = args.config_dir, + splice_indexes_string = args.splice_indexes, + feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, + num_targets = args.num_targets, + add_lda = args.add_lda, + cnn_layer = args.cnn_layer, + cnn_bottleneck_dim = args.cnn_bottleneck_dim, + cepstral_lifter = args.cepstral_lifter, + nonlin_input_dim = args.nonlin_input_dim, + nonlin_output_dim = args.nonlin_output_dim, + subset_dim = args.subset_dim, + pool_type = args.pool_type, pool_window = args.pool_window, + pool_lpfilter_width = args.pool_lpfilter_width, + use_presoftmax_prior_scale = args.use_presoftmax_prior_scale, + final_layer_normalize_target = args.final_layer_normalize_target, + include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + self_repair_scale = args.self_repair_scale, + objective_type = args.objective_type) + +if __name__ == "__main__": + Main() + diff --git a/egs/wsj/s5/steps/nnet2/train_multisplice_accel2_fix.sh b/egs/wsj/s5/steps/nnet3/tdnn/train.sh similarity index 54% rename from egs/wsj/s5/steps/nnet2/train_multisplice_accel2_fix.sh rename to egs/wsj/s5/steps/nnet3/tdnn/train.sh index 7e5990bc5e5..e21f5403737 100755 --- a/egs/wsj/s5/steps/nnet2/train_multisplice_accel2_fix.sh +++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh @@ -1,18 +1,14 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar # 2014 Vijayaditya Peddinti # Apache 2.0. -# train_multisplice_accel2.sh is a modified version of -# train_pnorm_multisplice2.sh (still using pnorm). 
The "accel" refers to the -# fact that we increase the number of jobs during training (from -# --num-jobs-initial to --num-jobs-final). We dropped "pnorm" from the name as -# it was getting too long. - # Begin configuration section. cmd=run.pl @@ -20,26 +16,24 @@ num_epochs=15 # Number of epochs of training; # the number of iterations is worked out from this. initial_effective_lrate=0.01 final_effective_lrate=0.001 -bias_stddev=0.5 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 -minibatch_size=128 # by default use a smallish minibatch size for neural net - # training; this controls instability which would otherwise - # be a problem with multi-threaded update. - +relu_dim= # you can use this to make it use ReLU's instead of p-norms. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training -prior_subset_size=10000 # 10k samples per job, for computing priors. Should be - # more than enough. +prior_subset_size=20000 # 20k samples per job, for computing priors. num_jobs_compute_prior=10 # these are single-threaded, run on CPU. -get_egs_stage=0 -fix_nnet=true -min_average=0.05 -max_average=0.95 +get_egs_stage=0 # can be used for rerunning after partial online_ivector_dir= -remove_egs=false # set to false to disable removing egs. +presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true +remove_egs=true # set to false to disable removing egs after training is done. max_models_combine=20 # The "max_models_combine" is the maximum number of models we give # to the final 'combine' stage, but these models will themselves be averages of @@ -55,51 +49,37 @@ shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of # affect each others' gradients. add_layers_period=2 # by default, add new layers every 2 iterations. -num_hidden_layers=3 -stage=-4 +stage=-6 exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage -splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3" +# count space-separated fields in splice_indexes to get num-hidden-layers. +splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization - -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't randprune=4.0 # speeds up LDA. -alpha=4.0 # relates to preconditioning. -update_period=4 # relates to online preconditioning: says how often we update the subspace. -num_samples_history=2000 # relates to online preconditioning -max_change_per_sample=0.075 -precondition_rank_in=20 # relates to online preconditioning -precondition_rank_out=80 # relates to online preconditioning - -mix_up=0 # Number of components to mix up to (should be > #tree leaves, if - # specified.) 
-num_threads=16 -parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" - # by default we use 16 threads; this lets the queue know. - # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. -combine_num_threads=8 -combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage. +use_gpu=true # if true, we run on GPU. cleanup=true egs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. lda_opts= -lda_dim= egs_opts= -transform_dir= # If supplied, overrides alidir -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. -feat_type= # Can be used to force "raw" features. +feat_type=raw # or set to 'lda' to use LDA features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. num_jobs_align=30 # Number of jobs for realignment # End configuration section. -frames_per_eg=8 # to be passed on to get_egs2.sh +frames_per_eg=8 # to be passed on to get_egs.sh +subset_dim=0 trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM @@ -121,17 +101,16 @@ if [ $# != 4 ]; then echo " # data, 0.00025 for large data" echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" - echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer," - echo " # per context-dependent state. Try a number several times #states." + echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" + echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" - echo " --num-threads # Number of parallel threads per job (will affect results" - echo " # as well as speed; may interact with batch size; if you increase" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" echo " # this, you may want to decrease the batch size." echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." 
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -141,15 +120,15 @@ if [ $# != 4 ]; then echo " # Format : layer/....layer/ " echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" - echo " # realignment is to be done" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -183,62 +162,116 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs cp $alidir/tree $dir -# process the splice_inds string, to get a layer-wise context string -# to be processed by the nnet-components -# this would be mainly used by SpliceComponent|SpliceMaxComponent -python steps/nnet2/make_multisplice_configs.py contexts --splice-indexes "$splice_indexes" $dir || exit -1; -context_string=$(cat $dir/vars) || exit -1 -echo $context_string -eval $context_string || exit -1; # - # initializes variables used by get_lda.sh and get_egs.sh - # get_lda.sh : first_left_context, first_right_context, - # get_egs.sh : nnet_left_context & nnet_right_context - -extra_opts=() -[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") -[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) -[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) -[ -z "$transform_dir" ] && transform_dir=$alidir -extra_opts+=(--transform-dir $transform_dir) -if [ $stage -le -4 ]; then - echo "$0: calling get_lda.sh" - steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1; +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! -f $alidir/final.mat ] && echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --subset-dim "$subset_dim" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; fi -# these files will have been written by get_lda.sh -feat_dim=$(cat $dir/feat_dim) || exit 1; -ivector_dim=$(cat $dir/ivector_dim) || exit 1; -lda_dim=$(cat $dir/lda_dim) || exit 1; -if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + - extra_opts+=(--left-context $nnet_left_context ) - extra_opts+=(--right-context $nnet_right_context ) - echo "$0: calling get_egs2.sh" - steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" \ +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ --samples-per-iter $samples_per_iter --stage $get_egs_stage \ - --io-opts "$io_opts" \ --cmd "$cmd" $egs_opts \ --frames-per-eg $frames_per_eg \ $data $alidir $dir/egs || exit 1; fi -if [ -z $egs_dir ]; then - egs_dir=$dir/egs - # confirm that the provided egs_dir has the necessary context - egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 - egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 - echo $egs_left_context $nnet_left_context $egs_right_context $nnet_right_context - ([[ $egs_left_context -lt $nnet_left_context ]] || [[ $egs_right_context -lt $nnet_right_context ]]) && - echo "Provided egs_dir $egs_dir does not have sufficient context to train the neural network." && exit -1; +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; fi +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). 
+egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } # num_archives_expanded considers each separate label-position from # 0..frames_per_eg-1 to be a separate archive. -num_archives_expanded=$[$num_archives*$frames_per_eg] +if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi [ $num_jobs_initial -gt $num_jobs_final ] && \ echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; @@ -246,49 +279,67 @@ num_archives_expanded=$[$num_archives*$frames_per_eg] [ $num_jobs_final -gt $num_archives_expanded ] && \ echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; -if ! [ $num_hidden_layers -ge 1 ]; then - echo "Invalid num-hidden-layers $num_hidden_layers" - exit 1 -fi -if [ $stage -le -2 ]; then - echo "$0: initializing neural net"; - lda_mat=$dir/lda.mat - tot_input_dim=$[$feat_dim+$ivector_dim] +if [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs - online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample" + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; - initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);") + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; - # create the config files for nnet initialization - python steps/nnet2/make_multisplice_configs.py \ - --splice-indexes "$splice_indexes" \ - --total-input-dim $tot_input_dim \ - --ivector-dim $ivector_dim \ - --lda-mat "$lda_mat" \ - --lda-dim $lda_dim \ - --pnorm-input-dim $pnorm_input_dim \ - --pnorm-output-dim $pnorm_output_dim \ - --online-preconditioning-opts "$online_preconditioning_opts" \ - --initial-learning-rate $initial_lrate \ - --bias-stddev $bias_stddev \ - --num-hidden-layers $num_hidden_layers \ - --num-targets $num_leaves \ - configs $dir || exit -1; + rm $all_lda_accs || exit 1; - $cmd $dir/log/nnet_init.log \ - nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \ - $dir/0.mdl || exit 1; + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
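+  # (Since no --dim option is passed to nnet-get-feature-transform below, the
+  # resulting lda.mat keeps the full spliced-input dimension: it decorrelates
+  # and rescales the features rather than reducing their dimensionality.)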
+ $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -2 ]; then + echo "$0: preparing initial vector for FixedScaleComponent before softmax" + echo " ... using priors^$presoftmax_prior_scale_power and rescaling to average 1" + + # obtains raw pdf count + $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; + $cmd $dir/log/sum_pdf_counts.log \ + vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; + rm $dir/pdf_counts.* + + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ + '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } + num_pdfs=NF-2; average_count = total/num_pdfs; + for (i=0; i $dir/presoftmax_prior_scale.vec + ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec fi -#if [ $pnorm_input_dim -eq $pnorm_output_dim ]; then fix_nnet=true;fi if [ $stage -le -1 ]; then - echo "Training transition probabilities and setting priors" - $cmd $dir/log/train_trans.log \ - nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \ - || exit 1; + # Add the first layer; this will add in the lda.mat and + # presoftmax_prior_scale.vec. + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + + # Convert to .mdl, train the transitions, set the priors. + $cmd $dir/log/init_mdl.log \ + nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1; fi + # set num_iters so that as close as possible, we process the data $num_epochs # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. @@ -302,59 +353,50 @@ num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] - -# mix up at the iteration where we've processed about half the data; this keeps -# the overall training procedure fairly invariant to the number of initial and -# final jobs. -# j = initial, k = final, n = num-iters, x = half-of-data epoch, -# p is proportion of data we want to process (e.g. p=0.5 here). -# solve for x if the amount of data processed by epoch x is p -# times the amount by iteration n. -# put this in wolfram alpha: -# solve { x*j + (k-j)*x*x/(2*n) = p * (j*n + (k-j)*n/2), {x} } -# got: x = (j n-sqrt(-n^2 (j^2 (p-1)-k^2 p)))/(j-k) and j!=k and n!=0 -# simplified manually to: n * (sqrt(((1-p)j^2 + p k^2)/2) - j)/(j-k) -mix_up_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters 0.5) -! [ $mix_up_iter -gt $finish_add_layers_iter ] && \ - echo "Mix-up-iter is $mix_up_iter, should be greater than $finish_add_layers_iter -> add more epochs?" \ - && exit 1; - echo "$0: Will train for $num_epochs epochs = $num_iters iterations" -[ $mix_up -gt 0 ] && echo "$0: Will mix up on iteration $mix_up_iter" -if [ $num_threads -eq 1 ]; then - parallel_suffix="-simple" # this enables us to use GPU code if - # we have just one thread. 
+if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" parallel_train_opts= if ! cuda-compiled; then echo "$0: WARNING: you are running with one thread but you have not compiled" echo " for CUDA. You may be running a setup optimized for GPUs. If you have" echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 fi else - parallel_suffix="-parallel" - parallel_train_opts="--num-threads=$num_threads" + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" fi approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] -# First work out how many models we want to combine over in the final -# nnet-combine-fast invocation. This equals +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). The number we use is: # min(max(max_models_combine, approx_iters_per_epoch_final), -# 2/3 * iters_after_mixup) -num_models_combine=$max_models_combine -if [ $num_models_combine -lt $approx_iters_per_epoch_final ]; then - num_models_combine=$approx_iters_per_epoch_final +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final fi -iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3] -if [ $num_models_combine -gt $iters_after_mixup_23 ]; then - num_models_combine=$iters_after_mixup_23 +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers fi -first_model_combine=$[$num_iters-$num_models_combine+1] +first_model_combine=$[$num_iters-$num_iters_combine+1] x=0 - for realign_time in $realign_times; do # Work out the iterations on which we will re-align, if the --realign-times # option was used. This is slightly approximate. @@ -369,13 +411,13 @@ cur_egs_dir=$egs_dir while [ $x -lt $num_iters ]; do [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; - if [ $x -gt $[$num_iters/2] ]; then fix_nnet=false; fi + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -392,9 +434,10 @@ while [ $x -lt $num_iters ]; do # we're using different random subsets of it. 
rm $dir/post.$x.*.vec 2>/dev/null $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ - nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \ - nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ - nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \ + nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. @@ -405,33 +448,36 @@ while [ $x -lt $num_iters ]; do echo "Re-adjusting priors based on computed posteriors" $cmd $dir/log/adjust_priors.$x.log \ - nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; + nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; sleep 2 - steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ + steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \ --iter $x $data $lang $dir $dir/ali_$time || exit 1 - steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$time \ + steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \ $prev_egs_dir $cur_egs_dir || exit 1 if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then - steps/nnet2/remove_egs.sh $prev_egs_dir + steps/nnet3/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ - nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs & + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & $cmd $dir/log/compute_prob_train.$x.log \ - nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs & - if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then $cmd $dir/log/progress.$x.log \ - nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \ - ark:$cur_egs_dir/train_diagnostic.egs '&&' \ - nnet-am-info $dir/$x.mdl & + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & fi echo "Training neural net (pass $x)" @@ -439,22 +485,24 @@ while [ $x -lt $num_iters ]; do if [ $x -gt 0 ] && \ [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ [ $[$x%$add_layers_period] -eq 0 ]; then - do_average=false # if we've just mixed up, don't do averaging take the best. - cur_num_hidden_layers=$[$x/$add_layers_period]; - mdl="nnet-init --srand=$x $dir/hidden_${cur_num_hidden_layers}.config - | nnet-insert $dir/$x.mdl - - | nnet-am-copy --learning-rate=$this_learning_rate - -|" + do_average=false # if we've just mixed up, don't do averaging but take the + # best. 
+ cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" else do_average=true if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. - mdl="nnet-am-copy --learning-rate=$this_learning_rate $dir/$x.mdl -|" + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" fi if $do_average; then this_minibatch_size=$minibatch_size else # on iteration zero or when we just added a layer, use a smaller minibatch - # size and just one job: the model-averaging doesn't seem to be helpful - # when the model is changing too fast (i.e. it worsens the objective - # function), and the smaller minibatch size will help to keep - # the update stable. + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. this_minibatch_size=$[$minibatch_size/2]; fi @@ -464,7 +512,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -477,11 +525,11 @@ while [ $x -lt $num_iters ]; do # same archive with different frame indexes will give similar gradients, # so we want to separate them in time. - $cmd $parallel_opts $dir/log/train.$x.$n.log \ - nnet-train$parallel_suffix $parallel_train_opts \ - --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ - $dir/$[$x+1].$n.mdl || touch $dir/.error & + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark,bg:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & done wait ) @@ -491,36 +539,26 @@ while [ $x -lt $num_iters ]; do nnets_list= for n in `seq 1 $this_num_jobs`; do - nnets_list="$nnets_list $dir/$[$x+1].$n.mdl" + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" done if $do_average; then # average the output of the different jobs. $cmd $dir/log/average.$x.log \ - nnet-am-average $nnets_list $dir/$[$x+1].mdl || exit 1; + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; else # choose the best from the different jobs. 
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; - cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; - fi - - if $fix_nnet; then - # do nnet-am-fix to fix some pathology in the network - nnet-am-fix --max-average-deriv=$max_average --min-average-deriv=$min_average $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log || exit; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; fi - if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then - # mix up. - echo Mixing up from $num_leaves to $mix_up components - $cmd $dir/log/mix_up.$x.log \ - nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \ - $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1; - fi rm $nnets_list [ ! -f $dir/$[$x+1].mdl ] && exit 1; if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ @@ -536,77 +574,51 @@ done if [ $stage -le $num_iters ]; then echo "Doing final combination to produce final.mdl" - # Now do combination. + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine nnets_list=() - # the if..else..fi statement below sets 'nnets_list'. - if [ $max_models_combine -lt $num_models_combine ]; then - # The number of models to combine is too large, e.g. > 20. In this case, - # each argument to nnet-combine-fast will be an average of multiple models. - cur_offset=0 # current offset from first_model_combine. - for n in $(seq $max_models_combine); do - next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" - for o in $(seq $cur_offset $[$next_offset-1]); do - iter=$[$first_model_combine+$o] - mdl=$dir/$iter.mdl - [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; - sub_list="$sub_list $mdl" - done - nnets_list[$[$n-1]]="nnet-am-average $sub_list - |" - cur_offset=$next_offset - done - else - nnets_list= - for n in $(seq 0 $[num_models_combine-1]); do - iter=$[$first_model_combine+$n] - mdl=$dir/$iter.mdl - [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; - nnets_list[$n]=$mdl - done - fi + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + mdl=$dir/$iter.mdl + [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; + nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; + done + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) - # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as - # if there are many models it can give out-of-memory error; set num-threads to 8 - # to speed it up (this isn't ideal...) 
- num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'` - mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads] - [ $mb -gt 512 ] && mb=512 - # Setting --initial-model to a large value makes it initialize the combination - # with the average of all the models. It's important not to start with a - # single model, or, due to the invariance to scaling that these nonlinearities - # give us, we get zero diagonal entries in the fisher matrix that - # nnet-combine-fast uses for scaling, which after flooring and inversion, has - # the effect that the initial model chosen gets much higher learning rates - # than the others. This prevents the optimization from working well. - $cmd $combine_parallel_opts $dir/log/combine.log \ - nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \ - --num-threads=$combine_num_threads \ - --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \ - $dir/final.mdl || exit 1; - - # Normalize stddev for affine or block affine layers that are followed by a - # pnorm layer and then a normalize layer. - $cmd $dir/log/normalize.log \ - nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1; + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. $cmd $dir/log/compute_prob_valid.final.log \ - nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs & + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & $cmd $dir/log/compute_prob_train.final.log \ - nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs & + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & fi if [ $stage -le $[$num_iters+1] ]; then echo "Getting average posterior for purposes of adjusting the priors." # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi rm $dir/post.$x.*.vec 2>/dev/null - $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ - nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \ - nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ - nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \ + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
@@ -618,7 +630,7 @@ if [ $stage -le $[$num_iters+1] ]; then echo "Re-adjusting priors based on computed posteriors" $cmd $dir/log/adjust_priors.final.log \ - nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; + nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; fi @@ -646,4 +658,3 @@ if $cleanup; then fi done fi - diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh new file mode 100755 index 00000000000..6fe772f7e0d --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh @@ -0,0 +1,547 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014-2016 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. + +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +configs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +frames_per_eg=8 # to be passed on to get_egs.sh + +# Raw nnet training options i.e. 
without transition model +nj=4 +dense_targets=true # Use dense targets instead of sparse targets + +# End configuration section. + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train scp:snr_targets/targets.scp exp/nnet3_snr_predictor" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +targets_scp=$2 +dir=$3 + +# Check some files. +for f in $data/feats.scp $targets_scp; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + +if [ ! -z "$configs_dir" ]; then + cp -rT $configs_dir $dir/configs || exit 1 +fi + +if [ $stage -le -5 ]; then + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
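+  # (init.config, as produced by the config generator, is expected to contain only
+  # the input node and the initial splicing/Append descriptor; nnet3-init turns it
+  # into init.raw, whose output is the spliced input on which the LDA-stats
+  # accumulation further below operates.)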
+ $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# model_left_context=(something) +# model_right_context=(something) +# num_hidden_layers=(something) +# num_targets=(something) +# add_lda=(true|false) +# include_log_softmax=(true|false) +# objective_type=(something) +. $dir/configs/vars || exit 1; +left_context=$model_left_context +right_context=$model_right_context + +[ -z "$num_targets" ] && echo "\$num_targets is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$add_lda" ] && echo "\$add_lda is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$include_log_softmax" ] && echo "\$include_log_softmax is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$objective_type" ] && echo "\$objective_type is not defined. Needs to be defined in $dir/configs/vars." && exit 1 + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +if $dense_targets; then + tmp_num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1 + + if [ $tmp_num_targets -ne $num_targets ]; then + echo "Mismatch between num-targets provided to script vs configs" + exit 1 + fi +fi + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir "$transform_dir") + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + + if $dense_targets; then + target_type=dense + else + target_type=sparse + fi + + steps/nnet3/get_egs_targets.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" --nj $nj \ + --frames-per-eg $frames_per_eg \ + --target-type $target_type --num-targets $num_targets \ + $data $targets_scp $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. 
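+# For example, with num_archives=120 and frames_per_eg=8, frame-level training
+# gives num_archives_expanded=960, while chunk training keeps it at 120.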
+if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if $add_lda && [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -1 ]; then + # Add the first layer; this will add in the lda.mat + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + +fi + + +# set num_iters so that as close as possible, we process the data $num_epochs +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] + +! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \ + && echo "$0: Insufficient epochs" && exit 1 + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + + +approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). 
The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + + +compute_accuracy=false +if [ "$objective_type" == "linear" ]; then + compute_accuracy=true +fi + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no $dir/$[x-1].raw $dir/$x.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info $dir/$x.raw & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
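+      # Worked example with made-up numbers: if num_archives_processed=10,
+      # num_archives=4 and frames_per_eg=8, then job n=1 gets k=10, so
+      # archive=(10%4)+1=3 and frame=(10/4)%8=2.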
+      for n in $(seq $this_num_jobs); do
+        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
+                                               # the other indexes from.
+        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
+        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
+        # index; this increases more slowly than the archive index because the
+        # same archive with different frame indexes will give similar gradients,
+        # so we want to separate them in time.
+
+        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
+          nnet3-train $parallel_train_opts \
+          --max-param-change=$max_param_change "$raw" \
+          "ark,bg:nnet3-copy-egs --frame=$frame $context_opts ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \
+          $dir/$[$x+1].$n.raw || touch $dir/.error &
+      done
+      wait
+    )
+    # the error message below is not that informative, but $cmd will
+    # have printed a more specific one.
+    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
+
+    nnets_list=
+    for n in `seq 1 $this_num_jobs`; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    if $do_average; then
+      # average the output of the different jobs.
+      $cmd $dir/log/average.$x.log \
+        nnet3-average $nnets_list $dir/$[x+1].raw || exit 1;
+    else
+      # choose the best from the different jobs.
+      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
+          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
+          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
+          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
+          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
+      [ -z "$n" ] && echo "Error getting best model" && exit 1;
+      $cmd $dir/log/select.$x.log \
+        nnet3-copy $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1;
+    fi
+
+    rm $nnets_list
+    [ ! -f $dir/$[$x+1].raw ] && exit 1;
+    if [ -f $dir/$[$x-1].raw ] && $cleanup && \
+       [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then
+      rm $dir/$[$x-1].raw
+    fi
+  fi
+  x=$[$x+1]
+  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
+done
+
+if [ $stage -le $num_iters ]; then
+  echo "Doing final combination to produce final.raw"
+
+  # Now do combination.  In the nnet3 setup, the logic for averaging subsets of
+  # the models, in the case where there are too many models to reliably estimate
+  # interpolation factors (max_models_combine), has been moved into nnet3-combine.
+  nnets_list=()
+  for n in $(seq 0 $[num_iters_combine-1]); do
+    iter=$[$first_model_combine+$n]
+    nnet=$dir/$iter.raw
+    [ ! -f $nnet ] && echo "Expected $nnet to exist" && exit 1;
+    nnets_list[$n]=$nnet
+  done
+
+  # Below we run nnet3-combine under $combine_queue_opt, which requests a GPU when
+  # --use-gpu is true (see above); without a GPU the combination will be quite slow,
+  # as the program is not multi-threaded.
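+  # (The models being combined are $first_model_combine.raw .. $num_iters.raw,
+  # $num_iters_combine of them; nnet3-combine estimates interpolation weights that
+  # are constrained to be positive and to sum to one, per the --enforce-* options.)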
+ + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --minibatch-size=1024 ark:$egs_dir/combine.egs ark:-|" \ + $dir/final.raw || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" & +fi + +if $include_log_softmax && [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purpose of using as prior to convert posteriors to likelihoods." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + $dir/final.raw ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. + + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm -f $dir/post.$x.*.vec; + +fi + + +if [ ! -f $dir/final.raw ]; then + echo "$0: $dir/final.raw does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.raw + fi + done +fi + diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh new file mode 100755 index 00000000000..838ae311906 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -0,0 +1,391 @@ +#!/bin/bash + +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2014-2015 Vimal Manohar +# Apache 2.0. + +set -o pipefail + +# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training +# using egs obtained by steps/nnet3/get_egs_discriminative.sh + +# Begin configuration section. +cmd=run.pl +num_epochs=4 # Number of epochs of training; + # the number of iterations is worked out from this. + # Be careful with this: we actually go over the data + # num-epochs * frame-subsampling-factor times, due to + # using different data-shifts. +use_gpu=true +truncate_deriv_weights=0 # can be used to set to zero the weights of derivs from frames + # near the edges. (counts subsampled frames). 
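+                         # e.g. setting it to 2 would zero the derivative weights of
+                         # the 2 outermost (subsampled) frames on each side of a chunk.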
+apply_deriv_weights=true +use_frame_shift=false +run_diagnostics=true +learning_rate=0.00002 +max_param_change=2.0 +scale_max_param_change=false # if this option is used, scale it by num-jobs. + +effective_lrate= # If supplied, overrides the learning rate, which gets set to effective_lrate * num_jobs_nnet. +acoustic_scale=0.1 # acoustic scale for MMI/MPFE/SMBR training. +boost=0.0 # option relevant for MMI + +criterion=smbr +drop_frames=false # option relevant for MMI +one_silence_class=true # option relevant for MPE/SMBR +num_jobs_nnet=4 # Number of neural net jobs to run in parallel. Note: this + # will interact with the learning rates (if you decrease + # this, you'll have to decrease the learning rate, and vice + # versa). +regularization_opts= +minibatch_size=64 # This is the number of examples rather than the number of output frames. +modify_learning_rates=false +last_layer_factor=1.0 # relates to modify-learning-rates +first_layer_factor=1.0 # relates to modify-learning-rates +shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + + +stage=-3 + +adjust_priors=true +num_threads=16 # this is the default but you may want to change it, e.g. to 1 if + # using GPUs. + +cleanup=true +keep_model_iters=1 +retroactive=false +remove_egs=false +src_model= # will default to $degs_dir/final.mdl + +left_deriv_truncate= # number of time-steps to avoid using the deriv of, on the left. +right_deriv_truncate= # number of time-steps to avoid using the deriv of, on the right. +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 exp/nnet3/tdnn_sp_degs exp/nnet3/tdnn_sp_smbr" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|4> # Number of epochs of training" + echo " --learning-rate # Learning rate to use" + echo " --effective-lrate # If supplied, learning rate will be set to" + echo " # this value times num-jobs-nnet." + echo " --num-jobs-nnet # Number of parallel jobs to use for main neural net" + echo " # training (will affect results as well as speed; try 8, 16)" + echo " # Note: if you increase this, you may want to also increase" + echo " # the learning rate. Also note: if there are fewer archives" + echo " # of egs than this, it will get reduced automatically." + echo " --num-threads # Number of parallel threads per job (will affect results" + echo " # as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size. With GPU, must be 1." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... " + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." 
+ echo " --criterion # Training criterion: may be smbr, mmi or mpfe" + echo " --boost # Boosting factor for MMI (e.g., 0.1)" + echo " --drop-frames # Option that affects MMI training: if true, we exclude gradients from frames" + echo " # where the numerator transition-id is not in the denominator lattice." + echo " --one-silence-class # Option that affects MPE/SMBR training (will tend to reduce insertions)" + echo " --modify-learning-rates # If true, modify learning rates to try to equalize relative" + echo " # changes across layers." + exit 1; +fi + +degs_dir=$1 +dir=$2 + +[ -z "$src_model" ] && src_model=$degs_dir/final.mdl + +# Check some files. +for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frames_per_eg,egs_per_archive} $src_model; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +mkdir -p $dir/log || exit 1; + +# copy some things +for f in splice_opts cmvn_opts tree final.mat; do + if [ -f $degs_dir/$f ]; then + cp $degs_dir/$f $dir/ || exit 1; + fi +done + +silphonelist=`cat $degs_dir/info/silence.csl` || exit 1; + +num_archives_priors=0 +if $adjust_priors; then + num_archives_priors=`cat $degs_dir/info/num_archives_priors` || exit 1 +fi + +frames_per_eg=$(cat $degs_dir/info/frames_per_eg) || { echo "error: no such file $degs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $degs_dir/info/num_archives) || exit 1; +frame_subsampling_factor=$(cat $degs_dir/info/frame_subsampling_factor) + +echo $frame_subsampling_factor > $dir/frame_subsampling_factor + +if $use_frame_shift; then + num_archives_expanded=$[$num_archives*$frame_subsampling_factor] +else + num_archives_expanded=$num_archives +fi + +if [ $num_jobs_nnet -gt $num_archives_expanded ]; then + echo "$0: num-jobs-nnet $num_jobs_nnet exceeds number of archives $num_archives_expanded," + echo " ... setting it to $num_archives." + num_jobs_nnet=$num_archives_expanded +fi + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[$num_archives_to_process/$num_jobs_nnet] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" +fi + +if $use_frame_shift; then + num_epochs_expanded=$[num_epochs*frame_subsampling_factor] +else + num_epochs_expanded=$num_epochs +fi + +for e in $(seq 1 $num_epochs_expanded); do + x=$[($e*$num_archives)/$num_jobs_nnet] # gives the iteration number. + iter_to_epoch[$x]=$e +done + +if [ $stage -le -1 ]; then + echo "$0: Copying initial model and modifying preconditioning setup" + + # Note, the baseline model probably had preconditioning, and we'll keep it; + # but we want online preconditioning with a larger number of samples of + # history, since in this setup the frames are only randomized at the segment + # level so they are highly correlated. It might make sense to tune this a + # little, later on, although I doubt it matters once the --num-samples-history + # is large enough. + + if [ ! 
-z "$effective_lrate" ]; then + learning_rate=$(perl -e "print ($num_jobs_nnet*$effective_lrate);") + echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate." + fi + + $cmd $dir/log/convert.log \ + nnet3-am-copy --learning-rate=$learning_rate "$src_model" $dir/0.mdl || exit 1; +fi + + +rm $dir/.error 2>/dev/null + +x=0 + +deriv_time_opts= +[ ! -z "$left_deriv_truncate" ] && deriv_time_opts="--optimization.min-deriv-time=$left_deriv_truncate" +[ ! -z "$right_deriv_truncate" ] && \ + deriv_time_opts="$deriv_time_opts --optimization.max-deriv-time=$((frames_per_eg - right_deriv_truncate))" + +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + if $run_diagnostics; then + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_objf_valid.$x.log \ + nnet3-discriminative-compute-objf $regularization_opts \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale \ + $dir/$x.mdl \ + ark:$degs_dir/valid_diagnostic.degs & + $cmd $dir/log/compute_objf_train.$x.log \ + nnet3-discriminative-compute-objf $regularization_opts \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale \ + $dir/$x.mdl \ + ark:$degs_dir/train_diagnostic.degs & + fi + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi + + + echo "Training neural net (pass $x)" + + cache_read_opt="--read-cache=$dir/cache.$x" + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in `seq $num_jobs_nnet`; do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + + if [ $n -eq 1 ]; then + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
+ cache_write_opt=" --write-cache=$dir/cache.$[$x+1]" + else + cache_write_opt="" + fi + + if $use_frame_shift; then + if [ $[num_archives % frame_subsampling_factor] -ne 0 ]; then + frame_shift=$[k % frame_subsampling_factor] + else + frame_shift=$[(k + k/num_archives) % frame_subsampling_factor] + fi + else + frame_shift=0 + fi + + #archive=$[(($n+($x*$num_jobs_nnet))%$num_archives)+1] + if $scale_max_param_change; then + this_max_param_change=$(perl -e "print ($max_param_change * $num_jobs_nnet);") + else + this_max_param_change=$max_param_change + fi + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-discriminative-train $cache_read_opt $cache_write_opt \ + --apply-deriv-weights=$apply_deriv_weights \ + $parallel_train_opts $deriv_time_opts \ + --max-param-change=$this_max_param_change \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale $regularization_opts \ + $dir/$x.mdl \ + "ark:nnet3-discriminative-copy-egs --frame-shift=$frame_shift --truncate-deriv-weights=$truncate_deriv_weights ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + [ -f $dir/.error ] && exit 1 + ) + [ -f $dir/.error ] && { echo "Found $dir/.error. See $dir/log/train.$x.*.log"; exit 1; } + + nnets_list=$(for n in $(seq $num_jobs_nnet); do echo $dir/$[$x+1].$n.raw; done) + + # below use run.pl instead of a generic $cmd for these very quick stages, + # so that we don't run the risk of waiting for a possibly hard-to-get GPU. + run.pl $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + + if $modify_learning_rates; then + run.pl $dir/log/modify_learning_rates.$x.log \ + nnet3-modify-learning-rates --retroactive=$retroactive \ + --last-layer-factor=$last_layer_factor \ + --first-layer-factor=$first_layer_factor \ + "nnet3-am-copy --raw $dir/$x.mdl -|" "nnet3-am-copy --raw $dir/$[$x+1].mdl -|" - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + rm $nnets_list + + if [ ! -z "${iter_to_epoch[$x]}" ]; then + e=${iter_to_epoch[$x]} + ln -sf $x.mdl $dir/epoch$e.mdl + fi + + if $adjust_priors && [ ! -z "${iter_to_epoch[$x]}" ]; then + if [ ! -f $degs_dir/priors_egs.1.ark ]; then + echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true." + echo "$0: Run this script with --adjust-priors false to not adjust priors" + exit 1 + fi + ( + e=${iter_to_epoch[$x]} + rm $dir/.error 2> /dev/null + + steps/nnet3/adjust_priors.sh --egs-type priors_egs \ + --num-jobs-compute-prior $num_archives_priors \ + --cmd "$cmd" --use-gpu false \ + --use-raw-nnet false --iter epoch$e $dir $degs_dir \ + || { touch $dir/.error; echo "Error in adjusting priors. See $dir/log/adjust_priors.epoch$e.log"; exit 1; } + ) & + fi + + [ -f $dir/.error ] && { echo "Found $dir/.error. Error on iteration $x"; exit 1; } + fi + + rm $dir/cache.$x 2>/dev/null || true + x=$[$x+1] + num_archives_processed=$[num_archives_processed+num_jobs_nnet] +done + +rm $dir/final.mdl 2>/dev/null +cp $dir/$x.mdl $dir/final.mdl +ln -sf final.mdl $dir/epoch$num_epochs_expanded.mdl + +if $adjust_priors && [ $stage -le $num_iters ]; then + if [ ! 
-f $degs_dir/priors_egs.1.ark ]; then + echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true." + echo "$0: Run this script with --adjust-priors false to not adjust priors" + exit 1 + fi + + steps/nnet3/adjust_priors.sh --egs-type priors_egs \ + --num-jobs-compute-prior $num_archives_priors \ + --cmd "$cmd $prior_queue_opt" --use-gpu false \ + --use-raw-nnet false --iter epoch$num_epochs_expanded \ + $dir $degs_dir || exit 1 +fi + +echo Done + + +# function to remove egs that might be soft links. +remove () { for x in $*; do [ -L $x ] && rm $(readlink -f $x); rm $x; done } + +if $cleanup && $remove_egs; then # note: this is false by default. + echo Removing training examples + remove $degs_dir/degs.* + remove $degs_dir/priors_egs.* +fi + + +if $cleanup; then + echo Removing most of the models + for x in `seq 1 $keep_model_iters $num_iters`; do + if [ -z "${iter_to_epoch[$x]}" ]; then + # if $x is not an epoch-final iteration.. + rm $dir/$x.mdl 2>/dev/null + fi + done +fi + diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py new file mode 100755 index 00000000000..e56baed97a9 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -0,0 +1,632 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains a feed forward DNN acoustic model using the cross-entropy objective. + DNNs include simple DNNs, TDNNs and CNNs. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', + default = 8, + help="Number of output labels per example") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. 
If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + "iteration. If 0 or a large value the randomization is" + "complete, but this will consume memory and cause spikes" + "in disk I/O. Smaller is easier on disk and memory but" + "less random. It's not a huge deal though, as samples" + "are anyway randomized right at the start." + "(the point of this is to get data in different" + "minibatches on different iterations, since in the" + "preconditioning method, 2 samples in the same minibatch" + "can affect each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per minibatch," + "measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + + # Realignment parameters + parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', + default=None, action=NullstrToNoneAction, + help="""Command to be used with steps/nnet3/align.sh during realignment""") + parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', + default=30, + help="Number of jobs to use for realignment") + parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', + default=None, action=NullstrToNoneAction, + help="""A space seperated string of realignment + times. Values must be between 0 and 1 + e.g. 
'0.1 0.2 0.3' """) + + parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', + default=True, action=StrToBoolAction, + choices = ["true", "false"], + help="If true, gpu is used with steps/nnet3/align.sh") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', + default = 512, + help="Size of the minibatch used to compute the gradient") + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. 
""") + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--lang", type=str, required = True, + help="Languade directory") + parser.add_argument("--ali-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.frames_per_eg < 1: + raise Exception("--egs.frames-per-eg should have a minimum value of 1") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs" + " directory which is the output of make_configs.py script") + + if args.transform_dir is None: + args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + if args.realign_use_gpu is True: + run_opts.realign_use_gpu = True + run_opts.realign_queue_opt = "--gpu 1" + else: + run_opts.realign_use_gpu = False + run_opts.realign_queue_opt = "" + + if args.realign_command is None: + run_opts.realign_command = args.command + else: + run_opts.realign_command = args.realign_command + run_opts.realign_num_jobs = args.realign_num_jobs + + run_opts.command = args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + self.realign_use_gpu = None + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame = (k / num_archives) % frames_per_eg + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" \ + "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + frame = frame, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + minibatch_size = minibatch_size), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + frames_per_eg, num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts): + + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file ) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. 
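+        # (So with, say, add_layers_period=2 and num_hidden_layers=4, averaging is
+        # skipped on iterations 0, 2, 4 and 6: iteration 0 plus the three
+        # layer-insertion iterations.)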
+ raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Set some variables. + num_leaves = GetNumberOfLeaves(args.ali_dir) + num_jobs = GetNumberOfJobs(args.ali_dir) + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [left_context, right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. 
This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if (args.stage <= -5): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -4) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + left_context, right_context, run_opts, + frames_per_eg = args.frames_per_eg, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.frames_per_eg == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -3): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -2): + logger.info("Computing initial vector for FixedScaleComponent before" + " softmax, using priors^{prior_scale} and rescaling to" + " average 1".format(prior_scale = args.presoftmax_prior_scale_power)) + + ComputePresoftmaxPriorScale(args.dir, args.ali_dir, num_jobs, run_opts, + presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) + + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
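+    # For example, with made-up values num_archives=120, frames_per_eg=8,
+    # num_epochs=8, num_jobs_initial=1 and num_jobs_final=8:
+    # num_archives_expanded = 960 and num_iters = (8*960*2)/(1+8) = 1706.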
+ num_archives_expanded = num_archives * args.frames_per_eg + num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + realign_iters = [] + if args.realign_times is not None: + realign_iters = GetRealignIters(args.realign_times, + num_iters, + args.num_jobs_initial, + args.num_jobs_final) + print(realign_iters) + # egs_dir will be updated if there is realignment + cur_egs_dir=egs_dir + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if iter in realign_iters: + logger.info("Re-aligning the data at iteration {0}".format(iter)) + prev_egs_dir=cur_egs_dir + cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) + new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) + Realign(args.dir, iter, args.feat_dir, args.lang, + prev_egs_dir, cur_egs_dir, + args.prior_subset_size, num_archives, run_opts, + transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) + if args.cleanup and args.egs_dir is None: + RemoveEgs(prev_egs_dir) + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + + logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + args.minibatch_size, args.frames_per_eg, + num_hidden_layers, args.add_layers_period, + left_context, right_context, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) + + if args.stage <= num_iters + 1: + logger.info("Getting average posterior for purposes of adjusting the priors.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) + + 
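+        # (The averaged posterior vector becomes the priors stored in final.mdl;
+        # at decode time the log-priors are subtracted from the network's
+        # log-posteriors to form the pseudo-log-likelihoods used by the decoder.)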
logger.info("Re-adjusting priors based on computed posteriors") + combined_model = "{dir}/combined.mdl".format(dir = args.dir) + final_model = "{dir}/final.mdl".format(dir = args.dir) + AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, cur_egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py new file mode 100755 index 00000000000..dec41409b06 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -0,0 +1,717 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains an RNN acoustic model using the cross-entropy objective. + RNNs include LSTMs, BLSTMs and GRUs. + RNN acoustic model training differs from feed-forward DNN training + in the following ways + 1. RNN acoustic models train on output chunks rather than individual + outputs + 2. The training includes additional stage of shrinkage, where + the parameters of the model are scaled when the derivative averages + at the non-linearities are below a threshold. + 3. 
RNNs can also be trained with state preservation training + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 20, + help="""Number of output labels in the sequence + used to train an LSTM. + Caution: if you double this you should halve + --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 40, + help="""Number of left steps used in the estimation of LSTM + state before prediction of the first label""") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="""Number of right steps used in the estimation of BLSTM + state before prediction of the first label""") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. 
+ (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=20000, + help="""This is really the number of egs in each + archive. Each eg has 'chunk_width' frames in it-- + for chunk_width=20, this value (20k) is equivalent + to the 400k number that we use as a default in + regular DNN training.""") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Realignment parameters + parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', + default=None, action=NullstrToNoneAction, + help="""Command to be used with steps/nnet3/align.sh during realignment""") + parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', + default=30, + help="Number of jobs to use for realignment") + parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', + default=None, action=NullstrToNoneAction, + help="""A space seperated string of realignment + times. Values must be between 0 and 1 + e.g. '0.1 0.2 0.3' """) + + parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', + default=True, action=StrToBoolAction, + choices = ["true", "false"], + help="If true, gpu is used with steps/nnet3/align.sh") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.5, + help="""Momentum used in update computation. 
+ Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 0.99, + help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") + + # RNN specific trainer options + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=100, + help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', + default=None, + help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--lang", type=str, required = True, + help="Languade directory") + parser.add_argument("--ali-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be positive") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be positive") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + if args.transform_dir is None: + args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + if args.realign_use_gpu is True: + run_opts.realign_use_gpu = True + run_opts.realign_queue_opt = "--gpu 1" + else: + run_opts.realign_use_gpu = False + run_opts.realign_queue_opt = "" + + if args.realign_command is None: + run_opts.realign_command = args.command + else: + run_opts.realign_command = args.realign_command + run_opts.realign_num_jobs = args.realign_num_jobs + + run_opts.command = args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. 
""" + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + self.realign_use_gpu = None + + +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. + cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + momentum = momentum, max_param_change = max_param_change, + min_deriv_time = min_deriv_time, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + left_context, right_context, 
min_deriv_time, + momentum, max_param_change, shuffle_buffer_size, + run_opts): + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + cache_read_opt, run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. 
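The {shrink} scale applied in the next command is the shrinkage_value chosen in Train() further below. DoShrinkage itself is not part of this patch, but the --trainer.optimization.shrink-value / shrink-threshold help text above pins down the decision it makes; a minimal stand-in, assuming the derivative average is supplied by the caller, would be:

def _sketch_choose_shrinkage(mean_sigmoid_deriv, shrink_threshold=0.15,
                             shrink_value=0.99):
    # When the derivative averages at the sigmoid non-linearities fall below
    # shrink-threshold, the parameter matrices are scaled by shrink-value;
    # otherwise the scale stays at 1.0. How the derivative average is read
    # from the model is not shown in this patch.
    return shrink_value if mean_sigmoid_deriv < shrink_threshold else 1.0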
+ RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Set some variables. + num_leaves = GetNumberOfLeaves(args.ali_dir) + num_jobs = GetNumberOfJobs(args.ali_dir) + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [model_left_context, model_right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
+ + if (args.stage <= -4): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
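The chunk-related bookkeeping in this RNN script differs from the feed-forward trainer in two ways: the egs carry extra frames for initializing the recurrent state, and truncated BPTT is expressed through the min-deriv-time computed a few lines below. A standalone sketch using the option defaults from this file (the model's own context values, normally read from the generated configs, are assumptions here):

def _sketch_rnn_bookkeeping(chunk_width=20, chunk_left_context=40,
                            chunk_right_context=0, model_left_context=2,
                            model_right_context=2, num_bptt_steps=None):
    # Context seen by the network: the model's own context plus the extra
    # frames requested for estimating the recurrent state.
    left_context = chunk_left_context + model_left_context
    right_context = chunk_right_context + model_right_context
    # These combined values are what the GenerateEgs call above receives,
    # alongside frames_per_eg = chunk_width.
    chunk_plus_left = chunk_width + left_context
    chunk_plus_right = chunk_width + right_context
    # Truncated BPTT: derivatives are propagated back num_bptt_steps from the
    # last label (default: the whole chunk), which the trainer is told via
    # --optimization.min-deriv-time.
    if num_bptt_steps is None:
        num_bptt_steps = chunk_width
    min_deriv_time = chunk_width - num_bptt_steps
    return (left_context, right_context,
            chunk_plus_left, chunk_plus_right, min_deriv_time)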
+ num_archives_to_process = args.num_epochs * num_archives + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + realign_iters = [] + if args.realign_times is not None: + realign_iters = GetRealignIters(args.realign_times, + num_iters, + args.num_jobs_initial, + args.num_jobs_final) + print(realign_iters) + # egs_dir will be updated if there is realignment + cur_egs_dir=egs_dir + + if args.num_bptt_steps is None: + num_bptt_steps = args.chunk_width + else: + num_bptt_steps = args.num_bptt_steps + + min_deriv_time = args.chunk_width - num_bptt_steps + + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if iter in realign_iters: + logger.info("Re-aligning the data at iteration {0}".format(iter)) + prev_egs_dir=cur_egs_dir + cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) + new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) + Realign(args.dir, iter, args.feat_dir, args.lang, + prev_egs_dir, cur_egs_dir, + args.prior_subset_size, num_archives, run_opts, + transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) + if args.cleanup and args.egs_dir is None: + RemoveEgs(prev_egs_dir) + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value, + args.num_chunk_per_minibatch, + num_hidden_layers, args.add_layers_period, + left_context, right_context, min_deriv_time, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, 
num_iters_combine, egs_dir, run_opts, + chunk_width = args.chunk_width) + + if args.stage <= num_iters + 1: + logger.info("Getting average posterior for purposes of adjusting the priors.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) + + logger.info("Re-adjusting priors based on computed posteriors") + combined_model = "{dir}/combined.mdl".format(dir = args.dir) + final_model = "{dir}/final.mdl".format(dir = args.dir) + AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, cur_egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) + pass + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index 842ce7e9c94..99122fedd73 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -2,7 +2,7 @@ # note, TDNN is the same as what we used to call multisplice. -# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -16,22 +16,23 @@ num_epochs=15 # Number of epochs of training; # the number of iterations is worked out from this. initial_effective_lrate=0.01 final_effective_lrate=0.001 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 relu_dim= # you can use this to make it use ReLU's instead of p-norms. rand_prune=4.0 # Relates to a speedup we do for LDA. minibatch_size=512 # This default is suitable for GPU-based training. # Set it to 128 for multi-threaded CPU-based training. - +max_param_change=2.0 # max param change per minibatch samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training -prior_subset_size=20000 # 20k samples per job, for computing priors. +prior_subset_size=20000 # 20k samples per job, for computing priors. num_jobs_compute_prior=10 # these are single-threaded, run on CPU. 
get_egs_stage=0 # can be used for rerunning after partial online_ivector_dir= presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true remove_egs=true # set to false to disable removing egs after training is done. max_models_combine=20 # The "max_models_combine" is the maximum number of models we give @@ -57,25 +58,20 @@ splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count - -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't randprune=4.0 # speeds up LDA. -affine_opts= - use_gpu=true # if true, we run on GPU. -num_threads=16 # if using CPU, the number of threads we use. cleanup=true egs_dir= max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. lda_opts= egs_opts= transform_dir= # If supplied, this dir used instead of alidir to find transforms. -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type=raw # or set to 'lda' to use LDA features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -113,7 +109,6 @@ if [ $# != 4 ]; then echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -123,15 +118,15 @@ if [ $# != 4 ]; then echo " # Format : layer/....layer/ " echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" - echo " # realignment is to be done" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." 
- + exit 1; fi @@ -193,13 +188,14 @@ if [ $stage -le -5 ]; then else dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" fi - + # create the config files for nnet initialization python steps/nnet3/make_tdnn_configs.py \ --splice-indexes "$splice_indexes" \ --feat-dim $feat_dim \ --ivector-dim $ivector_dim \ $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ --num-targets $num_leaves \ $dir/configs || exit 1; @@ -236,23 +232,22 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then echo "$0: calling get_egs.sh" steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ --samples-per-iter $samples_per_iter --stage $get_egs_stage \ - --io-opts "$io_opts" \ --cmd "$cmd" $egs_opts \ --frames-per-eg $frames_per_eg \ $data $alidir $dir/egs || exit 1; fi -if [ "$feat_dim" != "$(cat $dir/egs/info/feat_dim)" ]; then - echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $dir/egs/info/feat_dim)"; +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; exit 1; fi -if [ "$ivector_dim" != "$(cat $dir/egs/info/ivector_dim)" ]; then - echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $dir/egs/info/ivector_dim)"; +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; exit 1; fi -[ -z $egs_dir ] && egs_dir=$dir/egs - # copy any of the following that exist, to $dir. cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null @@ -260,8 +255,8 @@ cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null # the --egs-dir option was used on the command line). egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 -( ! [ $(cat $egs_dir/info/left_context) -le $left_context ] || - ! [ $(cat $egs_dir/info/right_context) -le $right_context ] ) && \ + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ echo "$0: egs in $egs_dir have too little context" && exit -1; frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } @@ -308,14 +303,14 @@ if [ $stage -le -2 ]; then echo "$0: preparing initial vector for FixedScaleComponent before softmax" echo " ... using priors^$presoftmax_prior_scale_power and rescaling to average 1" - # obtains raw pdf count + # obtains raw pdf count $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; $cmd $dir/log/sum_pdf_counts.log \ vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; rm $dir/pdf_counts.* - + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } num_pdfs=NF-2; average_count = total/num_pdfs; @@ -367,16 +362,11 @@ if $use_gpu; then exit 1 fi else - if [ $num_threads -gt 1 ]; then - parallel_suffix="-parallel" - parallel_train_opts="--num-threads=$num_threads" - train_queue_opt="--num-threads $num_threads" - combine_queue_opt="" # the combine stage will be quite slow if not using - # GPU, as we didn't enable that program to use - # multiple threads. - else - parallel_suffix="" - fi + echo "$0: without using a GPU this will be very slow. 
nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. prior_gpu_opt="--use-gpu=no" prior_queue_opt="" fi @@ -420,7 +410,7 @@ while [ $x -lt $num_iters ]; do ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -466,7 +456,7 @@ while [ $x -lt $num_iters ]; do steps/nnet3/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -476,13 +466,12 @@ while [ $x -lt $num_iters ]; do nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & - # nnet3-show-progress not implemented yet - #if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then - # $cmd $dir/log/progress.$x.log \ - # nnet3-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \ - # ark:$cur_egs_dir/train_diagnostic.egs '&&' \ - # nnet3-info $dir/$x.mdl & - #fi + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi echo "Training neural net (pass $x)" @@ -516,7 +505,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -530,8 +519,9 @@ while [ $x -lt $num_iters ]; do # so we want to separate them in time. 
$cmd $train_queue_opt $dir/log/train.$x.$n.log \ - nnet3-train$parallel_suffix $parallel_train_opts "$raw" \ - "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait @@ -555,7 +545,7 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; $cmd $dir/log/select.$x.log \ @@ -613,9 +603,11 @@ fi if [ $stage -le $[$num_iters+1] ]; then echo "Getting average posterior for purposes of adjusting the priors." # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi rm $dir/post.$x.*.vec 2>/dev/null $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ - nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ nnet3-merge-egs ark:- ark:- \| \ nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ diff --git a/egs/wsj/s5/steps/online/nnet2/copy_data_dir.sh b/egs/wsj/s5/steps/online/nnet2/copy_data_dir.sh index 78b5b1bde2f..b4e70fc6af0 100755 --- a/egs/wsj/s5/steps/online/nnet2/copy_data_dir.sh +++ b/egs/wsj/s5/steps/online/nnet2/copy_data_dir.sh @@ -9,7 +9,6 @@ # versions, so that each speaker has no more than --utts-per-spk-max # utterances. - # begin configuration section utts_per_spk_max=-1 # end configuration section @@ -34,7 +33,7 @@ srcdir=$1 destdir=$2 if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" + echo "$0: no such file $srcdir/utt2spk" exit 1; fi @@ -81,5 +80,6 @@ echo "$0: copied data from $srcdir to $destdir, with --utts-per-spk-max $utts_pe opts= [ ! -f $srcdir/feats.scp ] && opts="--no-feats" [ ! -f $srcdir/text ] && opts="$opts --no-text" +[ ! 
-f $srcdir/wav.scp ] && opts="$opts --no-wav" utils/validate_data_dir.sh $opts $destdir diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh index 924a2f20eaf..f27baecd673 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh @@ -258,7 +258,7 @@ base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1; start_dim=$base_feat_dim end_dim=$[$base_feat_dim+$ivector_dim-1] - +absdir=$(readlink -f $dir) if [ $stage -le 4 ]; then # here, we are just using the original features in $sdata/JOB/feats.scp for @@ -269,7 +269,7 @@ if [ $stage -le 4 ]; then select-feats "$start_dim-$end_dim" ark:- ark:- \| \ subsample-feats --n=$ivector_period ark:- ark:- \| \ copy-feats --compress=$compress ark:- \ - ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1; + ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1; fi if [ $stage -le 5 ]; then diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh index 81d8a3219dc..d8ac11da720 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh @@ -14,7 +14,7 @@ # for online decoding. # Rather than treating each utterance separately, it carries forward -# information from one utterance to the next, within the speaker. +# information from one utterance to the next, within the speaker. # Begin configuration section. @@ -45,7 +45,6 @@ max_count=0 # The use of this option (e.g. --max-count 100) can make # End configuration section. echo "$0 $@" # Print the command line for logging - if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; @@ -56,7 +55,7 @@ if [ $# != 3 ]; then echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --nj # Number of jobs" echo " --stage # To control partial reruns" echo " --num-gselect # Number of Gaussians to select using" echo " # diagonal model." 
@@ -94,6 +93,7 @@ echo -n >$ieconf cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1; echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf for x in $(echo $splice_opts); do echo "$x"; done > $dir/conf/splice.conf +echo "--ivector-period=$ivector_period" >>$ieconf echo "--splice-config=$dir/conf/splice.conf" >>$ieconf echo "--lda-matrix=$srcdir/final.mat" >>$ieconf echo "--global-cmvn-stats=$srcdir/global_cmvn.stats" >>$ieconf @@ -106,6 +106,7 @@ echo "--max-remembered-frames=1000" >>$ieconf # the default echo "--max-count=$max_count" >>$ieconf +absdir=$(readlink -f $dir) for n in $(seq $nj); do # This will do nothing unless the directory $dir/storage exists; @@ -118,7 +119,7 @@ if [ $stage -le 0 ]; then $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ ivector-extract-online2 --config=$ieconf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/feats.scp ark:- \| \ copy-feats --compress=$compress ark:- \ - ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1; + ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1; fi if [ $stage -le 1 ]; then diff --git a/egs/wsj/s5/steps/online/nnet3/decode.sh b/egs/wsj/s5/steps/online/nnet3/decode.sh new file mode 100755 index 00000000000..af8a33f3ac3 --- /dev/null +++ b/egs/wsj/s5/steps/online/nnet3/decode.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (Author: Daniel Povey) +# 2016 Api.ai (Author: Ilya Platonov) +# Apache 2.0 + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +max_active=7000 +beam=15.0 +lattice_beam=6.0 +acwt=0.1 # note: only really affects adaptation and pruning (scoring is on + # lattices). +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +per_utt=false +online=true # only relevant to non-threaded decoder. +do_endpointing=false +do_speex_compressing=false +scoring_opts= +skip_scoring=false +silence_weight=1.0 # set this to a value less than 1 (e.g. 0) to enable silence weighting. +max_state_duration=40 # This only has an effect if you are doing silence + # weighting. This default is probably reasonable. transition-ids repeated + # more than this many times in an alignment are treated as silence. +iter=final +online_config= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the models are, as prepared by steps/online/nnet3/prepare_online_decoding.sh" + echo "e.g.: $0 exp/chain/tdnn/graph data/test exp/chain/tdnn_online/decode/" + echo "" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --online-config # online decoder options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --acwt # acoustic scale used for lattice generation " + echo " --per-utt # If true, decode per utterance without" + echo " # carrying forward adaptation info from previous" + echo " # utterances of each speaker. Default: false" + echo " --online # Set this to false if you don't really care about" + echo " # simulating online decoding and just want the best" + echo " # results. 
This will use all the data within each" + echo " # utterance (plus any previous utterance, if not in" + echo " # per-utterance mode) to estimate the iVectors." + echo " --scoring-opts # options to local/score.sh" + echo " --iter # Iteration of model to decode; default is final." + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; + +if [ "$online_config" == "" ]; then + online_config=$srcdir/conf/online.conf; +fi + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +for f in $online_config $srcdir/${iter}.mdl \ + $graphdir/HCLG.fst $graphdir/words.txt $data/wav.scp; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + exit 1; + fi +done + +if ! $per_utt; then + spk2utt_rspecifier="ark:$sdata/JOB/spk2utt" +else + mkdir -p $dir/per_utt + for j in $(seq $nj); do + awk '{print $1, $1}' <$sdata/$j/utt2spk >$dir/per_utt/utt2spk.$j || exit 1; + done + spk2utt_rspecifier="ark:$dir/per_utt/utt2spk.JOB" +fi + +if [ -f $data/segments ]; then + wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |" +else + wav_rspecifier="ark,s,cs:wav-copy scp,p:$sdata/JOB/wav.scp ark:- |" +fi +if $do_speex_compressing; then + wav_rspecifier="$wav_rspecifier compress-uncompress-speex ark:- ark:- |" +fi +if $do_endpointing; then + wav_rspecifier="$wav_rspecifier extend-wav-with-silence ark:- ark:- |" +fi + +if [ "$silence_weight" != "1.0" ]; then + silphones=$(cat $graphdir/phones/silence.csl) || exit 1 + silence_weighting_opts="--ivector-silence-weighting.max-state-duration=$max_state_duration --ivector-silence-weighting.silence_phones=$silphones --ivector-silence-weighting.silence-weight=$silence_weight" +else + silence_weighting_opts= +fi + + +decoder=online2-wav-nnet3-latgen-faster +parallel_opts= +opts="--online=$online" + + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +fi + +if [ $stage -le 0 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + $decoder $opts $silence_weighting_opts --do-endpointing=$do_endpointing $frame_subsampling_opt \ + --config=$online_config \ + --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \ + $srcdir/${iter}.mdl $graphdir/HCLG.fst $spk2utt_rspecifier "$wav_rspecifier" \ + "$lat_wspecifier" || exit 1; +fi + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir +fi + +exit 0; diff --git a/egs/wsj/s5/steps/online/nnet3/prepare_online_decoding.sh b/egs/wsj/s5/steps/online/nnet3/prepare_online_decoding.sh new file mode 100755 index 00000000000..c7d7156068f --- /dev/null +++ b/egs/wsj/s5/steps/online/nnet3/prepare_online_decoding.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration. +stage=0 # This allows restarting after partway, when something when wrong. 
+feature_type=mfcc +add_pitch=false +mfcc_config=conf/mfcc.conf # you can override any of these you need to override. +plp_config=conf/plp.conf +fbank_config=conf/fbank.conf +# online_pitch_config is the config file for both pitch extraction and +# post-processing; we combine them into one because during training this +# is given to the program compute-and-process-kaldi-pitch-feats. +online_pitch_config=conf/online_pitch.conf + +# Below are some options that affect the iVectors, and should probably +# match those used in extract_ivectors_online.sh. +num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select +posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for + # inter-frame correlations. +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) + # caution: you should use the same value in the online-estimation + # code. +max_count=100 # This max-count of 100 can make iVectors more consistent for + # different lengths of utterance, by scaling up the prior term + # when the data-count exceeds this value. The data-count is + # after posterior-scaling, so assuming the posterior-scale is + # 0.1, --max-count 100 starts having effect after 1000 frames, + # or 10 seconds of data. +iter=final +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh; +. parse_options.sh || exit 1; + +if [ $# -ne 4 ] && [ $# -ne 3 ]; then + echo "Usage: $0 [options] [] " + echo "e.g.: $0 data/lang exp/nnet2_online/extractor exp/nnet2_online/nnet exp/nnet2_online/nnet_online" + echo "main options (for others, see top of script file)" + echo " --feature-type # Type of the base features; " + echo " # important to generate the correct" + echo " # configs in /conf/" + echo " --add-pitch # Append pitch features to cmvn" + echo " # (default: false)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --iter # iteration of model to take." + echo " --stage # stage to do partial re-run from." + exit 1; +fi + + +if [ $# -eq 4 ]; then + lang=$1 + iedir=$2 + srcdir=$3 + dir=$4 +else + [ $# -eq 3 ] || exit 1; + lang=$1 + iedir= + srcdir=$2 + dir=$3 +fi + +for f in $lang/phones/silence.csl $srcdir/${iter}.mdl $srcdir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done +if [ ! -z "$iedir" ]; then + for f in final.{mat,ie,dubm} splice_opts global_cmvn.stats online_cmvn.conf; do + [ ! -f $iedir/$f ] && echo "$0: no such file $iedir/$f" && exit 1; + done +fi + + +dir=$(readlink -f $dir) # Convert $dir to an absolute pathname, so that the + # configuration files we write will contain absolute + # pathnames. +mkdir -p $dir/conf + + +cp $srcdir/${iter}.mdl $dir/final.mdl || exit 1; +cp $srcdir/tree $dir/ || exit 1; +if [ -f $srcdir/frame_subsampling_factor ]; then + cp $srcdir/frame_subsampling_factor $dir/ +fi + +if [ ! -z "$iedir" ]; then + mkdir -p $dir/ivector_extractor/ + cp $iedir/final.{mat,ie,dubm} $iedir/global_cmvn.stats $dir/ivector_extractor/ || exit 1; + + # The following things won't be needed directly by the online decoding, but + # will allow us to run prepare_online_decoding.sh again with + # $dir/ivector_extractor/ as the input directory (useful in certain + # cross-system training scenarios). 
+ cp $iedir/splice_opts $iedir/online_cmvn.conf $dir/ivector_extractor/ || exit 1; +fi + + +mkdir -p $dir/conf +rm $dir/{plp,mfcc,fbank}.conf 2>/dev/null +echo "$0: preparing configuration files in $dir/conf" + +if [ -f $dir/conf/online.conf ]; then + echo "$0: moving $dir/conf/online.conf to $dir/conf/online.conf.bak" + mv $dir/conf/online.conf $dir/conf/online.conf.bak +fi + +conf=$dir/conf/online.conf +echo -n >$conf + +echo "--feature-type=$feature_type" >>$conf + +case "$feature_type" in + mfcc) + echo "--mfcc-config=$dir/conf/mfcc.conf" >>$conf + cp $mfcc_config $dir/conf/mfcc.conf || exit 1;; + plp) + echo "--plp-config=$dir/conf/plp.conf" >>$conf + cp $plp_config $dir/conf/plp.conf || exit 1;; + fbank) + echo "--fbank-config=$dir/conf/fbank.conf" >>$conf + cp $fbank_config $dir/conf/fbank.conf || exit 1;; + *) + echo "Unknown feature type $feature_type" +esac + + + +if [ ! -z "$iedir" ]; then + ieconf=$dir/conf/ivector_extractor.conf + echo -n >$ieconf + echo "--ivector-extraction-config=$ieconf" >>$conf + cp $iedir/online_cmvn.conf $dir/conf/online_cmvn.conf || exit 1; + # the next line puts each option from splice_opts on its own line in the config. + for x in $(cat $iedir/splice_opts); do echo "$x"; done > $dir/conf/splice.conf + echo "--splice-config=$dir/conf/splice.conf" >>$ieconf + echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf + echo "--lda-matrix=$dir/ivector_extractor/final.mat" >>$ieconf + echo "--global-cmvn-stats=$dir/ivector_extractor/global_cmvn.stats" >>$ieconf + echo "--diag-ubm=$dir/ivector_extractor/final.dubm" >>$ieconf + echo "--ivector-extractor=$dir/ivector_extractor/final.ie" >>$ieconf + echo "--num-gselect=$num_gselect" >>$ieconf + echo "--min-post=$min_post" >>$ieconf + echo "--posterior-scale=$posterior_scale" >>$ieconf # this is currently the default in the scripts. + echo "--max-remembered-frames=1000" >>$ieconf # the default + echo "--max-count=$max_count" >>$ieconf +fi + +if $add_pitch; then + echo "$0: enabling pitch features" + echo "--add-pitch=true" >>$conf + echo "$0: creating $dir/conf/online_pitch.conf" + if [ ! -f $online_pitch_config ]; then + echo "$0: expected file '$online_pitch_config' to exist."; + exit 1; + fi + cp $online_pitch_config $dir/conf/online_pitch.conf || exit 1; + echo "--online-pitch-config=$dir/conf/online_pitch.conf" >>$conf +fi + +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +echo "--endpoint.silence-phones=$silphonelist" >>$conf +echo "$0: created config file $conf" diff --git a/egs/wsj/s5/steps/paste_feats.sh b/egs/wsj/s5/steps/paste_feats.sh index da82179f616..abeee5aba23 100755 --- a/egs/wsj/s5/steps/paste_feats.sh +++ b/egs/wsj/s5/steps/paste_feats.sh @@ -44,10 +44,10 @@ done mkdir -p $ark_dir $logdir -mkdir -p $data +mkdir -p $data cp $data_src_first/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. -rm $data/cmvn.scp 2>/dev/null -rm $data/feats.scp 2>/dev/null +rm $data/cmvn.scp 2>/dev/null +rm $data/feats.scp 2>/dev/null # use "name" as part of name of the archive. name=`basename $data` @@ -58,19 +58,25 @@ for data_src in ${data_src_arr[@]}; do data_src_args="$data_src_args scp:$data_src/split$nj/JOB/feats.scp" done +for n in $(seq $nj); do + # the next command does nothing unless $arkdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
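+  # For example, if the storage/ subdirectory was set up beforehand (e.g. with
+  # utils/create_split_dir.pl pointing at several filesystems), each archive is
+  # pre-created as a symlink such as storage/3/pasted_<name>.<n>.ark, so the
+  # pasted features get spread across those disks; the exact index is chosen
+  # by create_data_link.pl.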
+ utils/create_data_link.pl $arkdir/pasted_$name.$n.ark +done + $cmd JOB=1:$nj $logdir/append.JOB.log \ paste-feats --length-tolerance=$length_tolerance $data_src_args ark:- \| \ copy-feats --compress=$compress ark:- \ ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1; - + # concatenate the .scp files together. for ((n=1; n<=nj; n++)); do cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1; done > $data/feats.scp || exit 1; -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" echo "consider using utils/fix_data_dir.sh $data" diff --git a/egs/wsj/s5/steps/rnnlmrescore.sh b/egs/wsj/s5/steps/rnnlmrescore.sh index c1302e2beed..2cb6700432a 100755 --- a/egs/wsj/s5/steps/rnnlmrescore.sh +++ b/egs/wsj/s5/steps/rnnlmrescore.sh @@ -1,5 +1,6 @@ #!/bin/bash +# please see lmrescore_rnnlm_lat.sh which is a newer script using lattices. # Begin configuration section. N=10 @@ -104,12 +105,14 @@ if [ "$oldlm" == "$oldlang/G.fst" ]; then if [ $stage -le 2 ]; then echo "$0: removing old LM scores." # Use the phi-matcher style of composition.. this is appropriate - # if the old LM scores were added e.g. by lmrescore.sh, using + # if the old LM scores were added e.g. by lmrescore.sh, using # phi-matcher composition. $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \ - lattice-compose --phi-label=$phi "ark:gunzip -c $dir/nbest1.JOB.gz|" $oldlm \ - "ark:|gzip -c >$dir/nbest2.JOB.gz" || exit 1; - fi + lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \ + lattice-compose --phi-label=$phi ark:- $oldlm ark:- \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \ + || exit 1; + fi else if [ $stage -le 2 ]; then echo "$0: removing old LM scores." @@ -187,7 +190,7 @@ if [ $stage -le 7 ]; then echo "$0: reconstructing total LM+graph scores including interpolation of RNNLM and old LM scores." for n in `seq $nj`; do paste $adir.$n/lmwt.nolm $adir.$n/lmwt.lmonly $adir.$n/lmwt.rnn | awk -v rnnweight=$rnnweight \ - '{ key=$1; graphscore=$2; lmscore=$4; rnnscore=$6; + '{ key=$1; graphscore=$2; lmscore=$4; rnnscore=$6; score = graphscore+(rnnweight*rnnscore)+((1-rnnweight)*lmscore); print $1,score; } ' > $adir.$n/lmwt.interp.$rnnweight || exit 1; done diff --git a/egs/wsj/s5/steps/score_kaldi.sh b/egs/wsj/s5/steps/score_kaldi.sh index 5ed223d0312..36fc0e429bc 100755 --- a/egs/wsj/s5/steps/score_kaldi.sh +++ b/egs/wsj/s5/steps/score_kaldi.sh @@ -14,6 +14,7 @@ beam=6 word_ins_penalty=0.0,0.5,1.0 min_lmwt=9 max_lmwt=20 +iter=final #end configuration section. echo "$0 $@" # Print the command line for logging @@ -137,12 +138,18 @@ if [ $stage -le 1 ]; then cat $dir/scoring_kaldi/wer_details/per_utt \| \ utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + fi fi # If we got here, the scoring was successful. 
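+# (the compute-wer-bootci call above attaches a bootstrap confidence interval
+# to the WER of the selected LM-weight / insertion-penalty combination by
+# resampling over utterances; the result is written to
+# scoring_kaldi/wer_details/wer_bootci.)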
# As a small aid to prevent confusion, we remove all wer_{?,??} files; -# these originate from the previous version of the scoring files +# these originate from the previous version of the scoring files rm $dir/wer_{?,??} 2>/dev/null exit 0; diff --git a/egs/wsj/s5/steps/score_kaldi_compare.sh b/egs/wsj/s5/steps/score_kaldi_compare.sh new file mode 100755 index 00000000000..91fc057b906 --- /dev/null +++ b/egs/wsj/s5/steps/score_kaldi_compare.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright 2016 Nicolas Serrano +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +replications=10000 +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_compare.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --replications # number of bootstrap evaluation to compute confidence." + exit 1; +fi + +dir1=$1 +dir2=$2 +dir_compare=$3 + +mkdir -p $dir_compare/log + +for d in $dir1 $dir2; do + for f in test_filt.txt best_wer; do + [ ! -f $d/$f ] && echo "score_compare.sh: no such file $d/$f" && exit 1; + done +done + + +best_wer_file1=$(awk '{print $NF}' $dir1/best_wer) +best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \ + awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}') + +best_wer_file2=$(awk '{print $NF}' $dir2/best_wer) +best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \ + awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}') + +$cmd $dir_compare/log/score_compare.log \ + compute-wer-bootci --replications=$replications \ + ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \ + '>' $dir_compare/wer_bootci_comparison || exit 1; + +exit 0; diff --git a/egs/wsj/s5/steps/select_feats.sh b/egs/wsj/s5/steps/select_feats.sh index 970823fdf25..072dd3194cf 100755 --- a/egs/wsj/s5/steps/select_feats.sh +++ b/egs/wsj/s5/steps/select_feats.sh @@ -43,31 +43,31 @@ mkdir -p $ark_dir $logdir mkdir -p $data cp $data_in/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. -rm $data/cmvn.scp 2>/dev/null -rm $data/feats.scp 2>/dev/null +rm $data/cmvn.scp 2>/dev/null +rm $data/feats.scp 2>/dev/null # use "name" as part of name of the archive. name=`basename $data` -for j in $(seq $nj); do +for j in $(seq $nj); do # the next command does nothing unless $mfccdir/storage/ exists, see # utils/create_data_link.pl for more info. - utils/create_data_link.pl $ark_dir/pasted_$name.$j.ark + utils/create_data_link.pl $ark_dir/selected_$name.$j.ark done $cmd JOB=1:$nj $logdir/append.JOB.log \ select-feats "$selector" scp:$data_in/split$nj/JOB/feats.scp ark:- \| \ copy-feats --compress=$compress ark:- \ - ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1; - + ark,scp:$ark_dir/selected_$name.JOB.ark,$ark_dir/selected_$name.JOB.scp || exit 1; + # concatenate the .scp files together. 
for ((n=1; n<=nj; n++)); do - cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1; + cat $ark_dir/selected_$name.$n.scp >> $data/feats.scp || exit 1; done > $data/feats.scp || exit 1; -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" exit 1; diff --git a/egs/wsj/s5/steps/shift_feats.sh b/egs/wsj/s5/steps/shift_feats.sh new file mode 100755 index 00000000000..9ad85368c3f --- /dev/null +++ b/egs/wsj/s5/steps/shift_feats.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0 + +# This script shifts the feats in the input data directory and creates a +# new directory _fs with shifted feats. +# If the shift is negative, the initial frames get truncated. +# If the shift is positive, the first frame is repeated. +# Usually applicable for sequence training + +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Begin configuration section. +cmd=run.pl +nj=4 +compress=true +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "usage: $0 [options] "; + echo "e.g.: $0 -1 data/train exp/shift-1_train mfcc" + echo "options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +num_frames_shift=$1 +data_in=$2 +logdir=$3 +featdir=$4 + +utt_prefix="fs$num_frames_shift-" +spk_prefix="fs$num_frames_shift-" + +# make $featdir an absolute pathname. +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +utils/split_data.sh $data_in $nj || exit 1; + +data=${data_in}_fs$num_frames_shift + +mkdir -p $featdir $logdir +mkdir -p $data + +utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \ + $data_in $data + +rm $data/feats.scp 2>/dev/null + +# use "name" as part of name of the archive. +name=`basename $data` + +for j in $(seq $nj); do + # the next command does nothing unless $mfccdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/raw_feats_$name.$j.ark +done + +$cmd JOB=1:$nj $logdir/shift.JOB.log \ + shift-feats --shift=$num_frames_shift \ + scp:$data_in/split$nj/JOB/feats.scp ark:- \| \ + copy-feats --compress=$compress ark:- \ + ark,scp:$featdir/raw_feats_$name.JOB.ark,$featdir/raw_feats_$name.JOB.scp || exit 1; + +# concatenate the .scp files together. 
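+# The "fs$num_frames_shift-" prefix added by the awk command below must match
+# the --utt-prefix/--spk-prefix given to copy_data_dir.sh above, so that the
+# keys in feats.scp agree with the renamed utterances in utt2spk (e.g. utt1
+# becomes fs-1-utt1 for a shift of -1).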
+for ((n=1; n<=nj; n++)); do + cat $featdir/raw_feats_$name.$n.scp +done | awk -v nfs=$num_frames_shift '{print "fs"nfs"-"$0}'>$data/feats.scp || exit 1; + +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` +if [ $nf -ne $nu ]; then + echo "It seems not all of the feature files were successfully processed ($nf != $nu);" + exit 1; +fi + +echo "Succeeded shifting features for $name into $data" + diff --git a/egs/wsj/s5/steps/train_diag_ubm.sh b/egs/wsj/s5/steps/train_diag_ubm.sh index 5ec4696c75c..5cac8c462da 100755 --- a/egs/wsj/s5/steps/train_diag_ubm.sh +++ b/egs/wsj/s5/steps/train_diag_ubm.sh @@ -53,6 +53,7 @@ silphonelist=`cat $lang/phones/silence.csl` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; echo $nj > $dir/num_jobs @@ -61,7 +62,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_lda_mllt.sh b/egs/wsj/s5/steps/train_lda_mllt.sh index c8522985a6d..f8f05c87f92 100755 --- a/egs/wsj/s5/steps/train_lda_mllt.sh +++ b/egs/wsj/s5/steps/train_lda_mllt.sh @@ -1,6 +1,13 @@ #!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# +# LDA+MLLT refers to the way we transform the features after computing +# the MFCCs: we splice across several frames, reduce the dimension (to 40 +# by default) using Linear Discriminant Analysis), and then later estimate, +# over multiple iterations, a diagonalizing transform known as MLLT or CTC. +# See http://kaldi.sourceforge.net/transform.html for more explanation. +# # Apache 2.0. # Begin configuration. @@ -85,7 +92,7 @@ feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |" if [ $stage -le -5 ]; then if [ -z "$use_lda_mat" ]; then echo "Accumulating LDA statistics." 
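+    # (the 2>/dev/null on the rm below keeps the script quiet when there are
+    # no stale lda.*.acc files to remove, e.g. on a fresh run.)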
- rm $dir/lda.*.acc + rm $dir/lda.*.acc 2>/dev/null $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \ @@ -204,7 +211,7 @@ while [ $x -lt $num_iters ]; do $cmd $dir/log/update.$x.log \ gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \ $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; - rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs + rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs fi [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; x=$[$x+1]; diff --git a/egs/wsj/s5/steps/train_map.sh b/egs/wsj/s5/steps/train_map.sh index a0b4e54bc3f..2bdf4d6cd77 100755 --- a/egs/wsj/s5/steps/train_map.sh +++ b/egs/wsj/s5/steps/train_map.sh @@ -45,6 +45,7 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log @@ -57,6 +58,7 @@ utils/ln.pl $alidir/ali.*.gz $dir echo $nj >$dir/num_jobs cp $alidir/splice_opts $dir 2>/dev/null cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; ## Set up features. @@ -64,7 +66,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir cp $alidir/full.mat $dir 2>/dev/null diff --git a/egs/wsj/s5/steps/train_mmi.sh b/egs/wsj/s5/steps/train_mmi.sh index dcee408c0d7..7ee0a135d00 100755 --- a/egs/wsj/s5/steps/train_mmi.sh +++ b/egs/wsj/s5/steps/train_mmi.sh @@ -57,9 +57,11 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. 
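+# delta_opts, if present in the alignment directory, records the options that
+# were passed to add-deltas (e.g. --delta-order) when the system was trained;
+# reading and copying it keeps the MMI features consistent with the alignments.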
+cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; echo $nj > $dir/num_jobs @@ -74,7 +76,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_mmi_fmmi.sh b/egs/wsj/s5/steps/train_mmi_fmmi.sh index 36130c3456b..4fd25ab13f3 100755 --- a/egs/wsj/s5/steps/train_mmi_fmmi.sh +++ b/egs/wsj/s5/steps/train_mmi_fmmi.sh @@ -76,9 +76,11 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -87,7 +89,7 @@ echo "$0: feature type is $feat_type" # Note: $feats is the features before fMPE. case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_mmi_fmmi_indirect.sh b/egs/wsj/s5/steps/train_mmi_fmmi_indirect.sh index 24670103917..42bb660cbf6 100755 --- a/egs/wsj/s5/steps/train_mmi_fmmi_indirect.sh +++ b/egs/wsj/s5/steps/train_mmi_fmmi_indirect.sh @@ -74,9 +74,11 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -85,7 +87,7 @@ echo "$0: feature type is $feat_type" # Note: $feats is the features before fMPE. 
case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_mono.sh b/egs/wsj/s5/steps/train_mono.sh index c03fbf4b118..9efeb9a084d 100755 --- a/egs/wsj/s5/steps/train_mono.sh +++ b/egs/wsj/s5/steps/train_mono.sh @@ -13,7 +13,7 @@ cmd=run.pl scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" num_iters=40 # Number of iterations of training max_iter_inc=30 # Last iter to increase #Gauss on. -totgauss=1000 # Target #Gaussians. +totgauss=1000 # Target #Gaussians. careful=false boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38"; @@ -65,7 +65,7 @@ shared_phones_opt="--shared-phones=$lang/phones/sets.int" if [ $stage -le -3 ]; then # Note: JOB=1 just uses the 1st part of the features-- we only need a subset anyway. if ! feat_dim=`feat-to-dim "$example_feats" - 2>/dev/null` || [ -z $feat_dim ]; then - feat-to-dim "$example_feats" + feat-to-dim "$example_feats" - echo "error getting feature dimension" exit 1; fi diff --git a/egs/wsj/s5/steps/train_quick.sh b/egs/wsj/s5/steps/train_quick.sh index 38d67cdd182..b6e99334b74 100755 --- a/egs/wsj/s5/steps/train_quick.sh +++ b/egs/wsj/s5/steps/train_quick.sh @@ -59,11 +59,13 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log echo $nj >$dir/num_jobs cp $alidir/splice_opts $dir 2>/dev/null cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; ## Set up features. 
@@ -71,7 +73,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir cp $alidir/full.mat $dir 2>/dev/null diff --git a/egs/wsj/s5/steps/train_sat.sh b/egs/wsj/s5/steps/train_sat.sh index 4fb35b2a722..51fddd3fe4b 100755 --- a/egs/wsj/s5/steps/train_sat.sh +++ b/egs/wsj/s5/steps/train_sat.sh @@ -32,6 +32,9 @@ power=0.2 # Exponent for number of gaussians according to occurrence counts cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves phone_map= train_tree=true +tree_stats_opts= +cluster_phones_opts= +compile_questions_opts= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -90,7 +93,7 @@ echo "$0: feature type is $feat_type" case $feat_type in delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" - cp $alidir/final.mat $dir + cp $alidir/final.mat $dir cp $alidir/full.mat $dir 2>/dev/null ;; *) echo "$0: invalid feature type $feat_type" && exit 1; @@ -101,7 +104,7 @@ if [ -f $alidir/trans.1 ]; then echo "$0: Using transforms from $alidir" feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" cur_trans_dir=$alidir -else +else if [ $stage -le -5 ]; then echo "$0: obtaining initial fMLLR transforms since not present in $alidir" # The next line is necessary because of $silphonelist otherwise being incorrect; would require @@ -123,7 +126,7 @@ if [ $stage -le -4 ] && $train_tree; then # Get tree stats. echo "$0: Accumulating tree stats" $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ - acc-tree-stats $context_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ + acc-tree-stats $context_opts $tree_stats_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; $cmd $dir/log/sum_tree_acc.log \ @@ -134,9 +137,9 @@ fi if [ $stage -le -3 ] && $train_tree; then echo "$0: Getting questions for tree clustering." # preparing questions, roots file... 
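+  # The tree_stats_opts, cluster_phones_opts and compile_questions_opts
+  # variables introduced above are passed straight through to acc-tree-stats,
+  # cluster-phones and compile-questions, so callers with non-standard
+  # topologies can override the defaults, e.g. (illustrative)
+  #   --cluster-phones-opts "--pdf-class-list=0"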
- cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1; + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2>$dir/log/questions.log || exit 1; cat $lang/phones/extra_questions.int >> $dir/questions.int - compile-questions $context_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; + compile-questions $context_opts $compile_questions_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; echo "$0: Building the tree" $cmd $dir/log/build_tree.log \ @@ -212,7 +215,7 @@ while [ $x -lt $num_iters ]; do feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" cur_trans_dir=$dir fi - + if [ $stage -le $x ]; then $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ gmm-acc-stats-ali $dir/$x.mdl "$feats" \ @@ -222,7 +225,7 @@ while [ $x -lt $num_iters ]; do gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \ "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; rm $dir/$x.mdl $dir/$x.*.acc - rm $dir/$x.occs + rm $dir/$x.occs fi [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; x=$[$x+1]; @@ -257,7 +260,7 @@ utils/summarize_warnings.pl $dir/log echo "$0: Likelihood evolution:" for x in `seq $[$num_iters-1]`; do tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); } - /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);} + /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);} END{ d /= t2; l /= t; printf("%s ", d+l); } ' done echo diff --git a/egs/wsj/s5/steps/train_sat_basis.sh b/egs/wsj/s5/steps/train_sat_basis.sh index a709096760a..cbe14249646 100755 --- a/egs/wsj/s5/steps/train_sat_basis.sh +++ b/egs/wsj/s5/steps/train_sat_basis.sh @@ -63,10 +63,12 @@ ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; sdata=$data/split$nj; splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null echo $nj >$dir/num_jobs [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -78,7 +80,7 @@ echo "$0: feature type is $feat_type" ## Set up speaker-independent features. 
case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_smbr.sh b/egs/wsj/s5/steps/train_smbr.sh index 1d38dc4532a..c8f9e8f7139 100755 --- a/egs/wsj/s5/steps/train_smbr.sh +++ b/egs/wsj/s5/steps/train_smbr.sh @@ -56,9 +56,11 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; echo $nj > $dir/num_jobs @@ -72,7 +74,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_ubm.sh b/egs/wsj/s5/steps/train_ubm.sh index bc8b19cd3b6..3b483872497 100755 --- a/egs/wsj/s5/steps/train_ubm.sh +++ b/egs/wsj/s5/steps/train_ubm.sh @@ -63,13 +63,14 @@ sdata=$data/split$nj; [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` ## Set up features. 
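+# (the feature type is inferred from the alignment directory: a final.mat there
+# means spliced + LDA/MLLT features, otherwise delta features; CMVN is applied
+# in both cases.)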
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/utils/build_const_arpa_lm.sh b/egs/wsj/s5/utils/build_const_arpa_lm.sh index 41760159b75..375ffd79eb4 100755 --- a/egs/wsj/s5/utils/build_const_arpa_lm.sh +++ b/egs/wsj/s5/utils/build_const_arpa_lm.sh @@ -33,7 +33,6 @@ mkdir -p $new_lang mkdir -p $new_lang cp -r $old_lang/* $new_lang - unk=`cat $new_lang/oov.int` bos=`grep "" $new_lang/words.txt | awk '{print $2}'` eos=`grep "" $new_lang/words.txt | awk '{print $2}'` diff --git a/egs/wsj/s5/utils/combine_ali_dirs.sh b/egs/wsj/s5/utils/combine_ali_dirs.sh new file mode 100755 index 00000000000..ae05326a3ee --- /dev/null +++ b/egs/wsj/s5/utils/combine_ali_dirs.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2016 Xiaohui Zhang Apache 2.0. + +# This srcipt operates on alignment directories, such as exp/tri4a_ali + +# Begin configuration section. +cmd=run.pl +extra_files= +num_jobs=4 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 [options] ..." + echo "e.g.: $0 --num-jobs 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2" + echo "Options:" + echo " --extra-files # specify addtional files in 'src-ali-dir1' to copy" + echo " --num-jobs # number of jobs used to split the data directory." + echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones." + echo " Other than alignments, only files from the first src ali dir are copied." + exit 1; +fi + +data=$1; +shift; +dest=$1; +shift; +first_src=$1; + +mkdir -p $dest; +rm $dest/{ali.*.gz,num_jobs} 2>/dev/null + +export LC_ALL=C + +for dir in $*; do + if [ ! -f $dir/ali.1.gz ]; then + echo "$0: check if alignments (ali.*.gz) are present in $dir." + exit 1; + fi +done + +for dir in $*; do + for f in tree; do + diff $first_src/$f $dir/$f 1>/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "$0: Cannot combine alignment directories with different $f files." + fi + done +done + +for f in final.mdl tree cmvn_opts num_jobs $extra_files; do + if [ ! -f $first_src/$f ]; then + echo "combine_ali_dir.sh: no such file $first_src/$f" + exit 1; + fi + cp $first_src/$f $dest/ +done + +src_id=0 +temp_dir=$dest/temp +mkdir -p $temp_dir +echo "$0: dumping alignments in each source directory as single archive and index." +for dir in $*; do + src_id=$((src_id + 1)) + cur_num_jobs=$(cat $dir/num_jobs) || exit 1; + all_ids=$(seq -s, $cur_num_jobs) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $dir/ali.{$all_ids}.gz|" \ + ark,scp:$temp_dir/ali.$src_id.ark,$temp_dir/ali.$src_id.scp || exit 1; +done +cat $temp_dir/ali.*.scp | sort -m > $temp_dir/ali.scp || exit 1; + +echo "$0: splitting data to get reference utt2spk for individual ali.JOB.gz files." 
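+# The combined alignments are re-split according to the utt2spk of the target
+# data directory, so the resulting ali.JOB.gz files line up with
+# $data/split$num_jobs/JOB no matter how the source directories were split.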
+utils/split_data.sh $data $num_jobs || exit 1; +echo $num_jobs > $dest/num_jobs || exit 1 + +echo "$0: splitting the alignments to appropriate chunks according to the reference utt2spk files." +for i in `seq 1 $num_jobs`; do + awk '{print $1}' $data/split$num_jobs/$i/utt2spk | sort > $temp_dir/utt_subset.$i + utils/filter_scp.pl $temp_dir/utt_subset.$i $temp_dir/ali.scp | \ + copy-int-vector scp:- "ark:|gzip -c >$dest/ali.$i.gz" || exit 1; +done + +echo "$0: checking the alignment files generated have at least 90% of the utterances." +for i in `seq 1 $num_jobs`; do + num_lines=` utils/filter_scp.pl $temp_dir/utt_subset.$i $temp_dir/ali.scp | wc -l` || exit 1; + num_lines_tot=`cat $temp_dir/utt_subset.$i |wc -l` || exit 1; + python -c "import sys; +percent = 100.0 * float($num_lines) / $num_lines_tot +if percent < 90 : + print ('$dest/ali.$i.gz {0}% utterances missing.'.format(percent))" || exit 1; +done +rm -r $temp_dir 2>/dev/null + +echo "Combined alignments and stored in $dest" +exit 0 diff --git a/egs/wsj/s5/utils/combine_data.sh b/egs/wsj/s5/utils/combine_data.sh index 2611a53045a..96fe99d42b3 100755 --- a/egs/wsj/s5/utils/combine_data.sh +++ b/egs/wsj/s5/utils/combine_data.sh @@ -39,9 +39,37 @@ for dir in $*; do fi done -for file in utt2spk utt2lang feats.scp text cmvn.scp segments reco2file_and_channel wav.scp spk2gender $extra_files; do +# W.r.t. utt2uniq file the script has different behavior compared to other files +# it is not compulsary for it to exist in src directories, but if it exists in +# even one it should exist in all. We will create the files where necessary +has_utt2uniq=false +for in_dir in $*; do + if [ -f $in_dir/utt2uniq ]; then + has_utt2uniq=true + break + fi +done + +if $has_utt2uniq; then + # we are going to create an utt2uniq file in the destdir + for in_dir in $*; do + if [ ! -f $in_dir/utt2uniq ]; then + # we assume that utt2uniq is a one to one mapping + cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' + else + cat $in_dir/utt2uniq + fi + done | sort -k1 > $dest/utt2uniq + echo "$0: combined utt2uniq" +fi +# some of the old scripts might provide utt2uniq as an extrafile, so just remove it +extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") + +for file in utt2spk utt2lang utt2dur feats.scp text cmvn.scp segments reco2file_and_channel wav.scp spk2gender $extra_files; do if [ -f $first_src/$file ]; then + set -o pipefail ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; + set +o pipefail echo "$0: combined $file" else echo "$0 [info]: not combining $file as it does not exist" diff --git a/egs/wsj/s5/utils/convert_slf.pl b/egs/wsj/s5/utils/convert_slf.pl index ee1941011e5..1bc6421f2da 100755 --- a/egs/wsj/s5/utils/convert_slf.pl +++ b/egs/wsj/s5/utils/convert_slf.pl @@ -115,7 +115,7 @@ $ss = scalar split(/_/, $ss); # update the end time - die "Node $s not yet visited, is lattice sorted topologically? $utt" unless exists $nodes{$s}; + die "Node $s not yet visited, is lattice sorted topologically? $utt" unless exists $nodes{$s}{t}; $time_end = $nodes{$s}{t} + $ss; if ($latest_time < $time_end) { $latest_time = $time_end; } diff --git a/egs/wsj/s5/utils/convert_slf_parallel.sh b/egs/wsj/s5/utils/convert_slf_parallel.sh index 1b4b2ef75fc..4e4ce41d236 100755 --- a/egs/wsj/s5/utils/convert_slf_parallel.sh +++ b/egs/wsj/s5/utils/convert_slf_parallel.sh @@ -33,7 +33,7 @@ dir=$3 model=$(dirname $dir)/final.mdl # assume model one level up from decoding dir. 
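+# Note: word alignment is done with lattice-align-words-lexicon below, so the
+# lang directory needs phones/align_lexicon.int; word_boundary.int is no longer
+# required.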
-for f in $lang/words.txt $lang/phones/word_boundary.int $model $dir/lat.1.gz; do +for f in $lang/words.txt $lang/phones/align_lexicon.int $model $dir/lat.1.gz; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; done @@ -50,7 +50,8 @@ nj=$(cat $dir/num_jobs) # convert the lattices (individually, gzipped) $cmd $parallel_opts JOB=1:$nj $dir/$dirname/log/lat_convert.JOB.log \ mkdir -p $dir/$dirname/JOB/ '&&' \ - lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true $lang/phones/align_lexicon.int $model "ark:gunzip -c $dir/lat.JOB.gz |" ark,t:- \| \ + lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true \ + $lang/phones/align_lexicon.int $model "ark:gunzip -c $dir/lat.JOB.gz |" ark,t:- \| \ utils/int2sym.pl -f 3 $lang/words.txt \| \ utils/convert_slf.pl $word_to_node_arg - $dir/$dirname/JOB/ || exit 1 diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh index e7a4b8276b3..5e1a9cba470 100755 --- a/egs/wsj/s5/utils/copy_data_dir.sh +++ b/egs/wsj/s5/utils/copy_data_dir.sh @@ -46,7 +46,7 @@ srcdir=$1 destdir=$2 if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" + echo "copy_data_dir.sh: no such file $srcdir/utt2spk" exit 1; fi @@ -57,6 +57,14 @@ mkdir -p $destdir cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map +if [ ! -f $srcdir/utt2uniq ]; then + if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then + cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq + fi +else + cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq +fi + cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk @@ -74,7 +82,7 @@ if [ -f $srcdir/segments ]; then cp $srcdir/reco2file_and_channel $destdir/ fi else # no segments->wav indexed by utt. - if [ -f $srcdir/wav.scp ]; then + if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp fi fi @@ -82,6 +90,9 @@ fi if [ -f $srcdir/text ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text fi +if [ -f $srcdir/utt2dur ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur +fi if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi diff --git a/egs/wsj/s5/utils/create_data_link.pl b/egs/wsj/s5/utils/create_data_link.pl index 0fafa2e041b..eeed315e6dd 100755 --- a/egs/wsj/s5/utils/create_data_link.pl +++ b/egs/wsj/s5/utils/create_data_link.pl @@ -43,29 +43,30 @@ sub GetGCD { foo/egs.3.4.ark -> storage/4/egs.3.4.ark -Usage: utils/create_data_link.pl - e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark +Usage: utils/create_data_link.pl [ ... ] + e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark + (note: the dirname, e.g. foo/bar/, must be the same in all cases). See also utils/remove_data_links.sh EOU GetOptions(); -if (@ARGV != 1) { +if (@ARGV == 0) { die $Usage; } -my $fullpath = shift(@ARGV); +my $example_fullpath = $ARGV[0]; # Check if the storage has been created. If so, do nothing. -my $dirname = dirname($fullpath); +my $dirname = dirname($example_fullpath); if (! 
-d "$dirname/storage") { exit(0); } # Storage exists, create symbolic links in the next few steps. -# First, get a list of the available storage direstories, and check if they are +# First, get a list of the available storage directories, and check if they are # properly created. opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n"; my @storage_dirs = grep(/^[0-9]*$/, readdir($dh)); @@ -83,25 +84,48 @@ sub GetGCD { } } -# Finally, work out the directory index where we should put the data to. -my $basename = basename($fullpath); -my $filename_numbers = $basename; -$filename_numbers =~ s/[^0-9]+/ /g; -my @filename_numbers = split(" ", $filename_numbers); -my $total = 0; -my $index = 0; -foreach my $x (@filename_numbers) { - if ($index >= scalar(@coprimes)) { - $index = 0; +my $ret = 0; + +foreach my $fullpath (@ARGV) { + if ($dirname ne dirname($fullpath)) { + die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath"; } - $total += $x * $coprimes[$index]; - $index++; -} -my $dir_index = $total % $num_storage + 1; -# Make the symbolic link. -if (-e $fullpath) { - unlink($fullpath); + # Finally, work out the directory index where we should put the data to. + my $basename = basename($fullpath); + my $filename_numbers = $basename; + $filename_numbers =~ s/[^0-9]+/ /g; + my @filename_numbers = split(" ", $filename_numbers); + my $total = 0; + my $index = 0; + foreach my $x (@filename_numbers) { + if ($index >= scalar(@coprimes)) { + $index = 0; + } + $total += $x * $coprimes[$index]; + $index++; + } + my $dir_index = $total % $num_storage + 1; + + # Make the symbolic link. + if (-e $fullpath) { + unlink($fullpath); + } + if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure + $ret = 1; # will exit with error status. + } } -my $ret = symlink("storage/$dir_index/$basename", $fullpath); -exit($ret == 1 ? 0 : 1); + +exit($ret); + +## testing: +# rm -rf foo bar +# mkdir -p bar/{1,2,3,4} +# mkdir -p foo/storage +# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done +# utils/create_data_link.pl utils/create_data_link.pl foo/1.3.ark foo/2.3.ark +# ls -l foo +# total 0 +# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 1.3.ark -> storage/3/1.3.ark +# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 2.3.ark -> storage/4/2.3.ark +# drwxr-xr-x 2 dpovey fax 38 Sep 2 17:40 storage diff --git a/egs/wsj/s5/utils/create_split_dir.pl b/egs/wsj/s5/utils/create_split_dir.pl index 0c4f023f7f3..0acf53f4c2c 100755 --- a/egs/wsj/s5/utils/create_split_dir.pl +++ b/egs/wsj/s5/utils/create_split_dir.pl @@ -44,20 +44,39 @@ my $dir = pop(@ARGV); system("mkdir -p $dir 2>/dev/null"); -my $index = 1; + +my @all_actual_storage = (); foreach my $file (@ARGV) { - $file = $file . "/" . $suffix; - my $actual_storage = File::Spec->rel2abs($file); + push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix); +} + +my $index = 1; +foreach my $actual_storage (@all_actual_storage) { my $pseudo_storage = "$dir/$index"; # If the symbolic link already exists, delete it. if (-l $pseudo_storage) { print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n"; + $index++; next; } # Create the destination directory and make the link. system("mkdir -p $actual_storage 2>/dev/null"); + if ($? != 0) { + print STDERR "$0: error creating directory $actual_storage\n"; + exit(1); + } + { # create a README file for easier deletion. 
+ open(R, ">$actual_storage/README.txt"); + my $storage_dir = File::Spec->rel2abs($dir); + print R "# This directory is linked from $storage_dir, as part of Kaldi striped data\n"; + print R "# The full list of directories where this data resides is:\n"; + foreach my $d (@all_actual_storage) { + print R "$d\n"; + } + close(R); + } my $ret = symlink($actual_storage, $pseudo_storage); # Process the returned values diff --git a/egs/wsj/s5/utils/data/combine_data.sh b/egs/wsj/s5/utils/data/combine_data.sh new file mode 120000 index 00000000000..0aed7e823b7 --- /dev/null +++ b/egs/wsj/s5/utils/data/combine_data.sh @@ -0,0 +1 @@ +../combine_data.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/copy_data_dir.sh b/egs/wsj/s5/utils/data/copy_data_dir.sh new file mode 120000 index 00000000000..b9854db4655 --- /dev/null +++ b/egs/wsj/s5/utils/data/copy_data_dir.sh @@ -0,0 +1 @@ +../copy_data_dir.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/get_frame_shift.sh b/egs/wsj/s5/utils/data/get_frame_shift.sh new file mode 100755 index 00000000000..77f5f8eb7dc --- /dev/null +++ b/egs/wsj/s5/utils/data/get_frame_shift.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script takes as input a data directory, such as data/train/, preferably +# with utt2dur file already existing (or the utt2dur file will be created if +# not), and it attempts to work out the approximate frame shift by comparing the +# utt2dur with the output of feat-to-len on the feats.scp. It prints it out. +# if the shift is very close to, but above, 0.01 (the normal frame shift) it +# rounds it down. + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + echo "This script prints the frame-shift (e.g. 0.01) to the standard out." + echo "If does not contain utt2dur, this script will call utils/data/get_utt2dur.sh," + echo "which will require write permission to " + exit 1 +fi + +export LC_ALL=C + +dir=$1 + +if [ ! -f $dir/utt2dur ]; then + echo "$0: $dir/utt2dur does not exist: creating it" 1>&2 + utils/data/get_utt2dur.sh $dir 1>&2 +fi + +if [ ! -f $dir/feats.scp ]; then + echo "$0: $dir/feats.scp does not exist" 1>&2 + exit 1 +fi + +temp=$(mktemp /tmp/tmp.XXXX) + +feat-to-len scp:$dir/feats.scp ark,t:- | head -n 10 > $temp + +if [ -z $temp ]; then + echo "$0: error running feat-to-len" 1>&2 + exit 1 +fi + +head -n 10 $dir/utt2dur | paste - $temp | \ + awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }' || exit 1; + +rm $temp + +exit 0 diff --git a/egs/wsj/s5/utils/data/get_num_frames.sh b/egs/wsj/s5/utils/data/get_num_frames.sh new file mode 100755 index 00000000000..9c4aae5e693 --- /dev/null +++ b/egs/wsj/s5/utils/data/get_num_frames.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# This script works out the approximate number of frames in a training directory. +# This is sometimes needed by higher-level scripts + + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 1 ]; then + ( + echo "Usage: $0 " + echo "Prints the number of frames of data in the data-dir" + ) 1>&2 +fi + +data=$1 + +if [ ! 
-f $data/utt2dur ]; then + utils/data/get_utt2dur.sh $data 1>&2 || exit 1 +fi + +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 + +awk -v s=$frame_shift '{n += $2} END{print int(n / s)}' <$data/utt2dur diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh new file mode 100755 index 00000000000..20e89e44ed9 --- /dev/null +++ b/egs/wsj/s5/utils/data/get_utt2dur.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and adds the +# utt2dur file if it does not already exist. The file 'utt2dur' maps from +# utterance to the duration of the utterance in seconds. This script works it +# out from the 'segments' file, or, if not present, from the wav.scp file (it +# first tries interrogating the headers, and if this fails, it reads the wave +# files in entirely.) + +frame_shift=0.01 + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 [options] " + echo "e.g.:" + echo " $0 data/train" + echo " Options:" + echo " --frame-shift # frame shift in seconds. Only relevant when we are" + echo " # getting duration from feats.scp (default: 0.01). " + exit 1 +fi + +export LC_ALL=C + +data=$1 + +if [ -f $data/utt2dur ]; then + echo "$0: $data/utt2dur file already exists. The script is not going to be executed." + exit 0; +fi + +if [ -f $data/segments ]; then + echo "$0: working out $data/utt2dur from $data/segments" + cat $data/segments | awk '{len=$4-$3; print $1, len;}' > $data/utt2dur +elif [ -f $data/wav.scp ]; then + echo "$0: segments file does not exist so getting durations from wave files" + + # if the wav.scp contains only lines of the form + # utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph | + if cat $data/wav.scp | perl -e ' + while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space. + @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ && + $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); } + $utt = $A[0]; $sphere_file = $A[4]; + if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; } + $sample_rate = -1; $sample_count = -1; + for ($n = 0; $n <= 30; $n++) { + $line = ; + if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; } + if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; } + if ($line =~ m/end_head/) { break; } + } + close(F); + if ($sample_rate == -1 || $sample_count == -1) { + die "could not parse sphere header from $sphere_file"; + } + $duration = $sample_count * 1.0 / $sample_rate; + print "$utt $duration\n"; + } ' > $data/utt2dur; then + echo "$0: successfully obtained utterance lengths from sphere-file headers" + else + echo "$0: could not get utterance lengths from sphere-file headers, using wav-to-duration" + if ! command -v wav-to-duration >/dev/null; then + echo "$0: wav-to-duration is not on your path" + exit 1; + fi + + read_entire_file=false + if cat $data/wav.scp | grep -q 'sox.*speed'; then + read_entire_file=true + echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow." + fi + + if ! 
wav-to-duration --read-entire-file=$read_entire_file scp:$data/wav.scp ark,t:$data/utt2dur 2>&1 | grep -v 'nonzero return status'; then + echo "$0: there was a problem getting the durations; moving $data/utt2dur to $data/.backup/" + mkdir -p $data/.backup/ + mv $data/utt2dur $data/.backup/ + fi + fi +elif [ -f $data/feats.scp ]; then + echo "$0: wave file does not exist so getting durations from feats files" + feat-to-len scp:$data/feats.scp ark,t:- | awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' >$data/utt2dur +else + echo "$0: Expected $data/wav.scp, $data/segments or $data/feats.scp to exist" + exit 1 +fi + +len1=$(cat $data/utt2spk | wc -l) +len2=$(cat $data/utt2dur | wc -l) +if [ "$len1" != "$len2" ]; then + echo "$0: warning: length of utt2dur does not equal that of utt2spk, $len2 != $len1" +fi + +echo "$0: computed $data/utt2dur" + +exit 0 diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh new file mode 120000 index 00000000000..1cd5db30d92 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh @@ -0,0 +1 @@ +../perturb_data_dir_speed.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh new file mode 100755 index 00000000000..a5a030ffdd8 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +# Apache 2.0 + +# This script does the standard 3-way speed perturbing of +# a data directory (it operates on the wav.scp). + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: perturb_data_dir_speed_3way.sh " + echo "Applies standard 3-way speed perturbation using factors of 0.9, 1.0 and 1.1." + echo "e.g.:" + echo " $0 data/train data/train_sp" + echo "Note: if /feats.scp already exists, this will refuse to run." + exit 1 +fi + +srcdir=$1 +destdir=$2 + +if [ ! -f $srcdir/wav.scp ]; then + echo "$0: expected $srcdir/wav.scp to exist" + exit 1 +fi + +if [ -f $destdir/feats.scp ]; then + echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)" + exit 1 +fi + +utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1 +utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1 +utils/data/combine_data.sh $destdir ${srcdir} ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1 + +rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 + +echo "$0: generated 3-way speed-perturbed version of data in $srcdir, in $destdir" +utils/validate_data_dir.sh --no-feats $destdir + diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh new file mode 100755 index 00000000000..7c58b59a846 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and modifies +# the wav.scp to perturb the volume (typically useful for training data when +# using systems that don't have cepstral mean normalization). + +. utils/parse_options.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + exit 1 +fi + +export LC_ALL=C + +data=$1 + +if [ ! 
-f $data/wav.scp ]; then + echo "$0: Expected $data/wav.scp to exist" + exit 1 +fi + +if grep -q "sox --vol" $data/wav.scp; then + echo "$0: It looks like the data was already volume perturbed. Not doing anything." + exit 0 +fi + +cat $data/wav.scp | python -c " +import sys, os, subprocess, re, random +random.seed(0) +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + if line.strip()[-1] == '|': + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) + else: + parts = line.split() + print '{id} sox --vol {vol} -t wav {wav} -t wav - |'.format(id = parts[0], wav=' '.join(parts[1:]), vol = random.uniform(scale_low, scale_high)) +" > $data/wav.scp_scaled || exit 1; + +len1=$(cat $data/wav.scp | wc -l) +len2=$(cat $data/wav.scp_scaled | wc -l) +if [ "$len1" != "$len2" ]; then + echo "$0: error detected: number of lines changed $len1 vs $len2"; + exit 1 +fi + +mv $data/wav.scp_scaled $data/wav.scp + +if [ -f $data/feats.scp ]; then + echo "$0: $data/feats.scp exists; moving it to $data/.backup/ as it wouldn't be valid any more." + mkdir -p $data/.backup/ + mv $data/feats.scp $data/.backup/ +fi + +echo "$0: added volume perturbation to the data in $data" +exit 0 + diff --git a/egs/wsj/s5/utils/data/validate_data_dir.sh b/egs/wsj/s5/utils/data/validate_data_dir.sh new file mode 120000 index 00000000000..1e19b4d921f --- /dev/null +++ b/egs/wsj/s5/utils/data/validate_data_dir.sh @@ -0,0 +1 @@ +../validate_data_dir.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh index 5493f4b03cb..f9d2890ea39 100755 --- a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh +++ b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh @@ -75,7 +75,7 @@ cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \ END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; print num / den, p } } ' | \ awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^' |\ - sort -k1,1 -k3 > $dir/lexiconp.txt + sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt n_old=$(wc -l <$dir/lexicon.txt) @@ -201,7 +201,7 @@ if [ -n "$sil_counts" ]; then if ($F_nl_EOS == "0.00") { $F_nl_EOS = "0.01"; } print SP " $P_BOS_sr\n_s $F_sl_EOS\n_n $F_nl_EOS\noverall $sil_prob\n"; ' $dir/lexiconp.txt $bigram_counts $dir/lexiconp_silprob_unsorted.txt $dir/silprob.txt - sort -k1,1 -k6 $dir/lexiconp_silprob_unsorted.txt > $dir/lexiconp_silprob.txt + sort -k1,1 -k2g,2 -k6 $dir/lexiconp_silprob_unsorted.txt > $dir/lexiconp_silprob.txt fi # now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are diff --git a/egs/wsj/s5/utils/filter_scps.pl b/egs/wsj/s5/utils/filter_scps.pl index 36c96a7a872..c4c283fb599 100755 --- a/egs/wsj/s5/utils/filter_scps.pl +++ b/egs/wsj/s5/utils/filter_scps.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) -# 2015 Xiaohui Zhang +# Copyright 2010-2012 Microsoft Corporation +# 2012-2016 Johns Hopkins University (author: Daniel Povey) +# 2015 Xiaohui Zhang # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ $idlist = shift @ARGV; -if (defined $jobname && $idlist !~ m/$jobname/ && +if ($idlist !~ m/$jobname/ && $jobend > $jobstart) { print STDERR "filter_scps.pl: you are trying to use multiple filter files as filter patterns but " . 
"you are providing just one filter file ($idlist)\n"; @@ -67,52 +67,96 @@ $infile = shift @ARGV; -open (F, "< $infile") or die "Can't open $infile for read: $!"; -my @inlines; -@inlines = ; -close(F); $outfile = shift @ARGV; -if (defined $jobname && $outfile !~ m/$jobname/ && - $jobend > $jobstart) { +if ($outfile !~ m/$jobname/ && $jobend > $jobstart) { print STDERR "filter_scps.pl: you are trying to create multiple filtered files but " . "you are providing just one output file ($outfile)\n"; exit(1); } +# This hashes from the id (e.g. utterance-id) to an array of the relevant +# job-ids (which are integers). In any normal use-case, this array will contain +# exactly one job-id for any given id, but we want to be agnostic about this. +%id2jobs = ( ); + +# Some variables that we set to produce a warning. +$warn_uncovered = 0; +$warn_multiply_covered = 0; + for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { - $outfile_n = $outfile; $idlist_n = $idlist; - if (defined $jobname) { - $idlist_n =~ s/$jobname/$jobid/g; - $outfile_n =~ s/$jobname/$jobid/g; - } + $idlist_n =~ s/$jobname/$jobid/g; open(F, "<$idlist_n") || die "Could not open id-list file $idlist_n"; - my %seen; + while() { @A = split; - @A>=1 || die "Invalid line $_ in id-list file $idlist_n"; - $seen{$A[0]} = 1; + @A >= 1 || die "Invalid line $_ in id-list file $idlist_n"; + $id = $A[0]; + if (! defined $id2jobs{$id}) { + $id2jobs{$id} = [ ]; # new anonymous array. + } + push @{$id2jobs{$id}}, $jobid; } close(F); - open(FW, ">$outfile_n") || die "Could not open output file $outfile_n"; - foreach (@inlines) { - if ($field == 1) { # Treat this as special case, since it is common. - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ($seen{$1}) { - print FW $_; - } - } else { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ($seen{$A[$field-1]}) { - print FW $_; +} + +# job2output hashes from the job-id, to an anonymous array containing +# a sequence of output lines. +%job2output = ( ); +for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { + $job2output{$jobid} = [ ]; # new anonymous array. +} + +open (F, "< $infile") or die "Can't open $infile for read: $!"; +while () { + if ($field == 1) { # Treat this as special case, since it is common. + $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; + # $1 is what we filter on. + $id = $1; + } else { + @A = split; + @A > 0 || die "Invalid scp file line $_"; + @A >= $field || die "Invalid scp file line $_"; + $id = $A[$field-1]; + } + if ( ! defined $id2jobs{$id}) { + $warn_uncovered = 1; + } else { + @jobs = @{$id2jobs{$id}}; # this dereferences the array reference. 
+ if (@jobs > 1) { + $warn_multiply_covered = 1; + } + foreach $job_id (@jobs) { + if (!defined $job2output{$job_id}) { + die "Likely code error"; } + push @{$job2output{$job_id}}, $_; } } +} +close(F); + +for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { + $outfile_n = $outfile; + $outfile_n =~ s/$jobname/$jobid/g; + open(FW, ">$outfile_n") || die "Could not open output file $outfile_n"; + $printed = 0; + foreach $line (@{$job2output{$jobid}}) { + print FW $line; + $printed = 1; + } + if (!printed) { + print STDERR "filter_scps.pl: warning: output to $outfile_n is empty\n"; + } close(FW); } + +if ($warn_uncovered) { + print STDERR "filter_scps.pl: warning: some input lines did not get output\n"; +} +if ($warn_multiply_covered) { + print STDERR "filter_scps.pl: warning: some input lines were output to multiple files\n"; +} diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 2ccaa89f507..b6ce1511814 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -1,9 +1,9 @@ #!/bin/bash -# This script makes sure that only the segments present in +# This script makes sure that only the segments present in # all of "feats.scp", "wav.scp" [if present], segments [if present] # text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into +# It puts the original contents of data-dir into # data-dir/.backup if [ $# != 1 ]; then @@ -35,7 +35,8 @@ function check_sorted { fi } -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp reco2file_and_channel spk2gender utt2lang; do +for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x @@ -61,7 +62,7 @@ function filter_file { function filter_recordings { # We call this once before the stage when we filter on utterance-id, and once # after. - + if [ -f $data/segments ]; then # We have a segments file -> we need to filter this and the file wav.scp, and # reco2file_and_utt, if it exists, to make sure they have the same list of @@ -78,7 +79,7 @@ function filter_recordings { utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp mv $tmpdir/recordings.tmp $tmpdir/recordings - + cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments filter_file $tmpdir/recordings $data/segments cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments @@ -86,7 +87,7 @@ function filter_recordings { filter_file $tmpdir/recordings $data/wav.scp [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - + fi } @@ -116,8 +117,6 @@ function filter_speakers { function filter_utts { cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts -# Do a check. - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; @@ -128,6 +127,10 @@ function filter_utts { ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; + if [ -f $data/utt2uniq ]; then + ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ + echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; + fi maybe_wav= [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. 
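# Typical usage, for illustration (the directory name is assumed):
#   utils/fix_data_dir.sh data/train
#   utils/validate_data_dir.sh data/train
# Per-utterance files such as utt2uniq and utt2dur are now filtered and backed
# up to data/train/.backup/ along with the other files.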
@@ -151,7 +154,7 @@ function filter_utts { fi fi - for x in utt2spk feats.scp vad.scp text segments utt2lang $maybe_wav; do + for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur $maybe_wav; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then @@ -168,8 +171,6 @@ filter_utts filter_speakers filter_recordings - - utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/egs/wsj/s5/utils/format_lm.sh b/egs/wsj/s5/utils/format_lm.sh index 32dbc68031a..4ab1c73217e 100755 --- a/egs/wsj/s5/utils/format_lm.sh +++ b/egs/wsj/s5/utils/format_lm.sh @@ -39,20 +39,9 @@ for f in phones.txt words.txt L.fst L_disambig.fst phones/; do done lm_base=$(basename $lm '.gz') -gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ - > $out_dir/oovs_${lm_base}.txt - -# Removing all "illegal" combinations of and , which are supposed to -# occur only at being/end of utt. These can cause determinization failures -# of CLG [ends up being epsilon cycles]. gunzip -c $lm \ - | egrep -v ' | | ' \ - | arpa2fst - | fstprint \ - | utils/remove_oovs.pl $out_dir/oovs_${lm_base}.txt \ - | utils/eps2disambig.pl | utils/s2eps.pl \ - | fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon | fstarcsort --sort_type=ilabel > $out_dir/G.fst + | arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$out_dir/words.txt - $out_dir/G.fst set +e fstisstochastic $out_dir/G.fst set -e @@ -66,7 +55,7 @@ set -e # this might cause determinization failure of CLG. # #0 is treated as an empty word. mkdir -p $out_dir/tmpdir.g -awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} +awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ < "$lexicon" > $out_dir/tmpdir.g/select_empty.fst.txt diff --git a/egs/wsj/s5/utils/format_lm_sri.sh b/egs/wsj/s5/utils/format_lm_sri.sh index 7753c186045..c62a356e05f 100755 --- a/egs/wsj/s5/utils/format_lm_sri.sh +++ b/egs/wsj/s5/utils/format_lm_sri.sh @@ -71,8 +71,8 @@ if [ -z $loc ]; then export PATH=$PATH:$sdir:$sdir/.. else echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. + echo or installed in $sdir. cd to ../../../tools and run + echo extras/install_srilm.sh. exit 1 fi fi @@ -85,30 +85,15 @@ mkdir -p $out_dir cp -r $lang_dir/* $out_dir || exit 1; lm_base=$(basename $lm '.gz') -gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ - > $out_dir/oovs_${lm_base}.txt || exit 1; - -# Removing all "illegal" combinations of and , which are supposed to -# occur only at being/end of utt. These can cause determinization failures -# of CLG [ends up being epsilon cycles]. -gunzip -c $lm \ - | egrep -v ' | | ' \ - | gzip -c > $tmpdir/lm.gz || exit 1; - awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; # Change the LM vocabulary to be the intersection of the current LM vocabulary -# and the set of words in the pronunciation lexicon. This also renormalizes the -# LM by recomputing the backoff weights, and remove those ngrams whose +# and the set of words in the pronunciation lexicon. This also renormalizes the +# LM by recomputing the backoff weights, and remove those ngrams whose # probabilities are lower than the backed-off estimates. 
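# A minimal sketch of the new-style compilation (file names are illustrative):
# the old find_arpa_oovs.pl / remove_oovs.pl / eps2disambig.pl pipeline is
# replaced by letting arpa2fst handle the disambiguation symbol and the word
# symbol table directly, e.g.
#   gunzip -c lm.arpa.gz | arpa2fst --disambig-symbol=#0 \
#     --read-symbol-table=words.txt - G.fst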
-change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \ - $srilm_opts || exit 1; - -arpa2fst $tmpdir/out_lm | fstprint \ - | utils/eps2disambig.pl | utils/s2eps.pl \ - | fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon | fstarcsort --sort_type=ilabel > $out_dir/G.fst || exit 1; +change-lm-vocab -vocab $tmpdir/voc -lm $lm -write-lm - $srilm_opts | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$out_dir/words.txt - $out_dir/G.fst || exit 1 fstisstochastic $out_dir/G.fst diff --git a/egs/wsj/s5/utils/gen_topo.pl b/egs/wsj/s5/utils/gen_topo.pl index 2ed33113260..b2e85a43606 100755 --- a/egs/wsj/s5/utils/gen_topo.pl +++ b/egs/wsj/s5/utils/gen_topo.pl @@ -5,7 +5,7 @@ # Generate a topology file. This allows control of the number of states in the # non-silence HMMs, and in the silence HMMs. -if(@ARGV != 4) { +if (@ARGV != 4) { print STDERR "Usage: utils/gen_topo.pl \n"; print STDERR "e.g.: utils/gen_topo.pl 3 5 4:5:6:7:8:9:10 1:2:3\n"; exit (1); @@ -13,8 +13,10 @@ ($num_nonsil_states, $num_sil_states, $nonsil_phones, $sil_phones) = @ARGV; -( $num_nonsil_states >= 1 && $num_nonsil_states <= 100 ) || die "Unexpected number of nonsilence-model states $num_nonsil_states\n"; -( $num_sil_states >= 3 && $num_sil_states <= 100 ) || die "Unexpected number of silence-model states $num_sil_states\n"; +( $num_nonsil_states >= 1 && $num_nonsil_states <= 100 ) || + die "Unexpected number of nonsilence-model states $num_nonsil_states\n"; +(( $num_sil_states == 1 || $num_sil_states >= 3) && $num_sil_states <= 100 ) || + die "Unexpected number of silence-model states $num_sil_states\n"; $nonsil_phones =~ s/:/ /g; $sil_phones =~ s/:/ /g; @@ -33,31 +35,45 @@ print " $num_nonsil_states \n"; # non-emitting final state. print "\n"; # Now silence phones. They have a different topology-- apart from the first and -# last states, it's fully connected. -$transp = 1.0 / ($num_sil_states-1); +# last states, it's fully connected, as long as you have >= 3 states. -print "\n"; -print "\n"; -print "$sil_phones\n"; -print "\n"; -print " 0 0 "; -for ($nextstate = 0; $nextstate < $num_sil_states-1; $nextstate++) { # Transitions to all but last - # emitting state. - print " $nextstate $transp "; -} -print "\n"; -for ($state = 1; $state < $num_sil_states-1; $state++) { # the central states all have transitions to - # themselves and to the last emitting state. - print " $state $state "; - for ($nextstate = 1; $nextstate < $num_sil_states; $nextstate++) { +if ($num_sil_states > 1) { + $transp = 1.0 / ($num_sil_states-1); + print "\n"; + print "\n"; + print "$sil_phones\n"; + print "\n"; + print " 0 0 "; + for ($nextstate = 0; $nextstate < $num_sil_states-1; $nextstate++) { # Transitions to all but last + # emitting state. print " $nextstate $transp "; } print "\n"; + for ($state = 1; $state < $num_sil_states-1; $state++) { # the central states all have transitions to + # themselves and to the last emitting state. + print " $state $state "; + for ($nextstate = 1; $nextstate < $num_sil_states; $nextstate++) { + print " $nextstate $transp "; + } + print "\n"; + } + # Final emitting state (non-skippable). 
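# (For illustration, with the default of 5 silence states: the final emitting
# state is state 4, which gets a 0.75 self-loop and a 0.25 transition to the
# non-emitting state 5, as printed below.)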
+ $state = $num_sil_states-1; + print " $state $state $state 0.75 $num_sil_states 0.25 \n"; + # Final nonemitting state: + print " $num_sil_states \n"; + print "\n"; +} else { + print "\n"; + print "\n"; + print "$sil_phones\n"; + print "\n"; + print " 0 0 "; + print " 0 0.75 "; + print " 1 0.25 "; + print "\n"; + print " $num_nonsil_states \n"; # non-emitting final state. + print "\n"; } -# Final emitting state (non-skippable). -$state = $num_sil_states-1; -print " $state $state $state 0.75 $num_sil_states 0.25 \n"; -# Final nonemitting state: -print " $num_sil_states \n"; -print "\n"; + print "\n"; diff --git a/egs/wsj/s5/utils/lang/add_lex_disambig.pl b/egs/wsj/s5/utils/lang/add_lex_disambig.pl new file mode 120000 index 00000000000..2d1d4425b49 --- /dev/null +++ b/egs/wsj/s5/utils/lang/add_lex_disambig.pl @@ -0,0 +1 @@ +../add_lex_disambig.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/check_g_properties.pl b/egs/wsj/s5/utils/lang/check_g_properties.pl new file mode 100755 index 00000000000..ee0f6ddb515 --- /dev/null +++ b/egs/wsj/s5/utils/lang/check_g_properties.pl @@ -0,0 +1,89 @@ +#!/usr/bin/env perl + +use IPC::Open2; + +if (@ARGV != 1) { + print "Usage: $0 [options] \n"; + print "e.g.: $0 data/lang\n"; + exit(1); +} + +$lang = shift @ARGV; + +# This script checks that G.fst in the lang.fst directory is OK with respect +# to certain expected properties, and returns nonzero exit status if a problem was +# detected. It is called from validate_lang.pl. +# This only checks the properties of G that relate to disambiguation symbols, +# epsilons and forbidden symbols and . + +if (! -e "$lang/G.fst") { + print "$0: error: $lang/G.fst does not exist\n"; + exit(1); +} + +open(W, "<$lang/words.txt") || die "opening $lang/words.txt"; +$hash_zero = -1; +while () { + @A = split(" ", $_); + ($sym, $int) = @A; + if ($sym eq "" || $sym eq "") { $is_forbidden{$int} = 1; } + if ($sym eq "#0") { $hash_zero = $int; } +} + +if (-e "$lang/phones/wdisambig_words.int") { + open(F, "<$lang/phones/wdisambig_words.int") || die "opening $lang/phones/wdisambig_words.int"; + while () { + chop; + $is_disambig{$_} = 1; + } +} else { + $is_disambig{$hash_zero} = 1; +} + +$input_cmd = ". ./path.sh; fstprint $lang/G.fst|"; +open(G, $input_cmd) || die "running command $input_cmd"; + +$info_cmd = ". ./path.sh; fstcompile | fstinfo "; +open2(O, I, "$info_cmd") || die "running command $info_cmd"; + +$has_epsilons = 0; + +while () { + @A = split(" ", $_); + if (@A >= 4) { + if ($is_forbidden{$A[2]} || $is_forbidden{$A[3]}) { + chop; + print "$0: validating $lang: error: line $_ in G.fst contains forbidden symbol or \n"; + exit(1); + } elsif ($is_disambig{$A[2]}) { + print I $_; + if ($A[3] != 0) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has disambig on input but no epsilon on output\n"; + exit(1); + } + } elsif ($A[2] == 0) { + print I $_; + $has_epsilons = 1; + } elsif ($A[2] != $A[3]) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has inputs and outputs different but input is not disambig symbol.\n"; + exit(1); + } + } +} + +close(I); # tell 'fstcompile | fstinfo' pipeline that its input is done. +while () { + if (m/cyclic\s+y/) { + print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons. Would cause determinization failure\n"; + exit(1); + } +} + +if ($has_epsilons) { + print "$0: warning: validating $lang: G.fst has epsilon-input arcs. 
We don't expect these in most setups.\n"; +} + +print "--> $0 successfully validated $lang/G.fst\n"; +exit(0); diff --git a/egs/wsj/s5/utils/lang/prepare_lang.sh b/egs/wsj/s5/utils/lang/prepare_lang.sh new file mode 120000 index 00000000000..96b9f592e82 --- /dev/null +++ b/egs/wsj/s5/utils/lang/prepare_lang.sh @@ -0,0 +1 @@ +../prepare_lang.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/validate_lang.pl b/egs/wsj/s5/utils/lang/validate_lang.pl new file mode 120000 index 00000000000..edb66bf3149 --- /dev/null +++ b/egs/wsj/s5/utils/lang/validate_lang.pl @@ -0,0 +1 @@ +../validate_lang.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/make_lexicon_fst.pl b/egs/wsj/s5/utils/make_lexicon_fst.pl index 0558ab20bca..bcf0f4df13a 100755 --- a/egs/wsj/s5/utils/make_lexicon_fst.pl +++ b/egs/wsj/s5/utils/make_lexicon_fst.pl @@ -29,7 +29,7 @@ if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt -Creates a lexicon FST that transduces phones to words, and may allow optional silence. +Creates a lexicon FST that transduces phones to words, and may allow optional silence. Note: ordinarily, each line of lexicon.txt is: word phone1 phone2 ... phoneN; if the --pron-probs option is used, each line is: word pronunciation-probability phone1 phone2 ... phoneN. The probability 'prob' will typically be between zero and one, and note that it's generally helpful to normalize so the largest one @@ -42,7 +42,7 @@ $lexfn = shift @ARGV; if (@ARGV == 0) { $silprob = 0.0; -} elsif (@ARGV == 2) { +} elsif (@ARGV == 2) { ($silprob,$silphone) = @ARGV; } else { ($silprob,$silphone,$sildisambig) = @ARGV; @@ -57,19 +57,6 @@ open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; -sub is_sil { - # Return true (1) if provided with a phone-sequence - # that means silence. - # @_ is the parameters of the function - # This function returns true if @_ equals ( $silphone ) - # or something of the form ( "#0", $silphone, "#1" ) - # where the "#0" and "#1" are disambiguation symbols. - return ( @_ == 1 && $_[0] eq $silphone || - (@_ == 3 && $_[1] eq $silphone && - $_[0] =~ m/^\#\d+$/ && - $_[0] =~ m/^\#\d+$/)); -} - if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. $loopstate = 0; $nextstate = 1; # next unallocated state. @@ -92,7 +79,7 @@ sub is_sil { $pron_cost = -log($pron_prob); } if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - + $s = $loopstate; $word_or_eps = $w; while (@A > 0) { @@ -148,18 +135,16 @@ sub is_sil { $word_or_eps = ""; $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. $s = $ns; + } elsif (!defined($silphone) || $p ne $silphone) { + # This is non-deterministic but relatively compact, + # and avoids epsilons. + $local_nosilcost = $nosilcost + $pron_cost; + $local_silcost = $silcost + $pron_cost; + print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; + print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; } else { - if (!is_sil($p)) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. 
- print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } + # no point putting opt-sil after silence word. + print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; } } } diff --git a/egs/wsj/s5/utils/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/make_phone_bigram_lang.sh index 87d1db8f3e8..548cb223632 100755 --- a/egs/wsj/s5/utils/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/make_phone_bigram_lang.sh @@ -11,7 +11,7 @@ # language-id. -# We might later have options here; if not, I'llr emove this. +# We might later have options here; if not, I'll emove this. echo "$0 $@" # Print the command line for logging @@ -42,11 +42,16 @@ rm -r $lang_out/phones 2>/dev/null cp -r $lang/phones/ $lang_out/ rm $lang_out/phones/word_boundary.* 2>/dev/null # these would # no longer be valid. +rm $lang_out/phones/wdisambig* 2>/dev/null # ditto this. + # List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst # are determinizable without any. echo -n > $lang_out/phones/disambig.txt echo -n > $lang_out/phones/disambig.int echo -n > $lang_out/phones/disambig.csl +echo -n > $lang_out/phones/wdisambig.txt +echo -n > $lang_out/phones/wdisambig_phones.int +echo -n > $lang_out/phones/wdisambig_words.int # Let OOV symbol be the first phone. This is arbitrary, it's just # so that validate_lang.pl succeeds. We should never actually use @@ -81,7 +86,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ foreach $p (@phones) { $src = $phn2state{$p}; $hist = $histcount{$p}; - $hist > 0 || die; + $hist > 0 || die; foreach $q (@phones) { $c = $count{$p,$q}; if (defined $c) { @@ -92,7 +97,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ } $c = $count{$p,""}; if (defined $c) { - $cost = -log($c / $hist); # cost on FST arc. + $cost = -log($c / $hist); # cost on FST arc. print "$src $cost\n"; # final-prob. } } ' | fstcompile --acceptor=true | \ @@ -101,7 +106,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ # symbols for phones and words are the same. # Neither has disambig symbols. cp $lang_out/phones.txt $lang_out/words.txt - + grep -v '' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \ fstcompile > $lang_out/L.fst @@ -115,5 +120,4 @@ utils/sym2int.pl $lang_out/phones.txt <$lang_out/phones/align_lexicon.txt >$lang # L and L_disambig are the same. cp $lang_out/L.fst $lang_out/L_disambig.fst -utils/validate_lang.pl $lang_out || exit 1; -echo "$0: ignore warnings RE disambiguation symbols from validate_lang.pl (these are expected)" +utils/validate_lang.pl --skip-disambig-check $lang_out || exit 1; diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index f68c6f4099c..b7023538e9b 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -7,24 +7,26 @@ # all the language-model, pronunciation dictionary (lexicon), context-dependency, # and HMM structure in our model. The output is a Finite State Transducer # that has word-ids on the output, and pdf-ids on the input (these are indexes -# that resolve to Gaussian Mixture Models). +# that resolve to Gaussian Mixture Models). 
# See # http://kaldi.sourceforge.net/graph_recipe_test.html # (this is compiled from this repository using Doxygen, # the source for this part is in src/doc/graph_recipe_test.dox) +set -o pipefail -N=3 -P=1 tscale=1.0 loopscale=0.1 reverse=false +remove_oov=false -for x in `seq 5`; do - [ "$1" == "--mono" ] && N=1 && P=0 && shift; - [ "$1" == "--quinphone" ] && N=5 && P=2 && shift; +for x in `seq 6`; do + [ "$1" == "--mono" ] && context=mono && shift; + [ "$1" == "--left-biphone" ] && context=lbiphone && shift; + [ "$1" == "--quinphone" ] && context=quinphone && shift; [ "$1" == "--reverse" ] && reverse=true && shift; + [ "$1" == "--remove-oov" ] && remove_oov=true && shift; [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2; [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2; done @@ -56,13 +58,23 @@ for f in $required; do [ ! -f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1; done +N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; } +P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; } +if [[ $context == mono && ($N != 1 || $P != 0) || \ + $context == lbiphone && ($N != 2 || $P != 1) || \ + $context == quinphone && ($N != 5 || $P != 2) ]]; then + echo "mkgraph.sh: mismatch between the specified context (--$context) and the one in the tree: N=$N, P=$P" + exit 1 +fi + mkdir -p $lang/tmp -# Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in +# Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in # place of -o if [[ ! -s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $lang/tmp/LG.fst || exit 1; + fstminimizeencoded | fstpushspecial | \ + fstarcsort --sort_type=ilabel > $lang/tmp/LG.fst || exit 1; fstisstochastic $lang/tmp/LG.fst || echo "[info]: LG not stochastic." fi @@ -94,7 +106,12 @@ fi if [[ ! -s $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ $dir/HCLGa.fst -ot $clg ]]; then - fsttablecompose $dir/Ha.fst $clg | fstdeterminizestar --use-log=true \ + if $remove_oov; then + [ ! -f $lang/oov.int ] && \ + echo "$0: --remove-oov option: no file $lang/oov.int" && exit 1; + clg="fstrmsymbols --remove-arcs=true --apply-to-output=true $lang/oov.int $clg|" + fi + fsttablecompose $dir/Ha.fst "$clg" | fstdeterminizestar --use-log=true \ | fstrmsymbols $dir/disambig_tid.int | fstrmepslocal | \ fstminimizeencoded > $dir/HCLGa.fst || exit 1; fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic" @@ -105,7 +122,7 @@ if [[ ! -s $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1; if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. + # No point doing this test if transition-scale not 1, as it is bound to fail. fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic." fi fi diff --git a/egs/wsj/s5/utils/pbs.pl b/egs/wsj/s5/utils/pbs.pl new file mode 100755 index 00000000000..6c8d4488882 --- /dev/null +++ b/egs/wsj/s5/utils/pbs.pl @@ -0,0 +1,587 @@ +#!/usr/bin/env perl +use strict; +use warnings; + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). 
+# 2014 Johns Hopkins University (Author: Vimal Manohar) +# 2015 Queensland University of Technology (Author: Ahilan Kanagasundaram ) +# Apache 2.0. + +use File::Basename; +use Cwd; +use Getopt::Long; + +# This is a version of the queue.pl modified so that it works under PBS +# The PBS is one of the several "almost compatible" queueing systems. The +# command switches and environment variables are different, so we are adding +# a this script. An optimal solution might probably be to make the variable +# names and the commands configurable, as similar problems can be expected +# with Torque, Univa... and who knows what else +# +# queue.pl has the same functionality as run.pl, except that +# it runs the job in question on the queue (Sun GridEngine). +# This version of queue.pl uses the task array functionality +# of the grid engine. Note: it's different from the queue.pl +# in the s4 and earlier scripts. + +# The script now supports configuring the queue system using a config file +# (default in conf/pbs.conf; but can be passed specified with --config option) +# and a set of command line options. +# The current script handles: +# 1) Normal configuration arguments +# For e.g. a command line option of "--gpu 1" could be converted into the option +# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a +# line in the config file like +# gpu=* -q g.q -l gpu=$0 +# $0 here in the line is replaced with the argument read from the CLI and the +# resulting string is passed to qsub. +# 2) Special arguments to options such as +# gpu=0 +# If --gpu 0 is given in the command line, then no special "-q" is given. +# 3) Default argument +# default gpu=0 +# If --gpu option is not passed in the command line, then the script behaves as +# if --gpu 0 was passed since 0 is specified as the default argument for that +# option +# 4) Arbitrary options and arguments. +# Any command line option starting with '--' and its argument would be handled +# as long as its defined in the config file. +# 5) Default behavior +# If the config file that is passed using is not readable, then the script +# behaves as if the queue has the following config file: +# $ cat conf/pbs.conf +# # Default configuration +# command qsub -v PATH -S /bin/bash -l arch=*64* +# option mem=* -l mem_free=$0,ram_free=$0 +# option mem=0 # Do not add anything to qsub_opts +# option num_threads=* -pe smp $0 +# option num_threads=1 # Do not add anything to qsub_opts +# option max_jobs_run=* -tc $0 +# default gpu=0 +# option gpu=0 -q all.q +# option gpu=* -l gpu=$0 -q g.q + +my $qsub_opts = ""; +my $sync = 0; +my $num_threads = 1; +my $gpu = 0; + +my $config = "conf/pbs.conf"; + +my %cli_options = (); + +my $jobname; +my $jobstart; +my $jobend; + +my $array_job = 0; + +sub print_usage() { + print STDERR + "Usage: queue.pl [options] [JOB=1:n] log-file command-line arguments...\n" . + "e.g.: queue.pl foo.log echo baz\n" . + " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n" . + "or: queue.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n" . + " (which is an example of using a pipe; you can provide other escaped bash constructs)\n" . + "or: queue.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n" . + " (which illustrates the mechanism to submit parallel jobs; note, you can use \n" . + " another string other than JOB)\n" . + "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n" . + "and change its behavior. Otherwise it uses qstat to work out when the job finished\n" . + "Options:\n" . 
+ " --config (default: $config)\n" . + " --mem (e.g. --mem 2G, --mem 500M, \n" . + " also support K and numbers mean bytes)\n" . + " --num-threads (default: $num_threads)\n" . + " --max-jobs-run \n" . + " --gpu <0|1> (default: $gpu)\n"; + exit 1; +} + +if (@ARGV < 2) { + print_usage(); +} + +for (my $x = 1; $x <= 2; $x++) { # This for-loop is to + # allow the JOB=1:n option to be interleaved with the + # options to qsub. + while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { + my $switch = shift @ARGV; + + if ($switch eq "-V") { + $qsub_opts .= "-V "; + } else { + my $argument = shift @ARGV; + if ($argument =~ m/^--/) { + print STDERR "queue.pl: Warning: suspicious argument '$argument' to $switch; starts with '-'\n"; + } + if ($switch eq "-sync" && $argument =~ m/^[yY]/) { + $sync = 1; + $qsub_opts .= "$switch $argument "; + } elsif ($switch eq "-pe") { # e.g. -pe smp 5 + my $argument2 = shift @ARGV; + $qsub_opts .= "$switch $argument $argument2 "; + $num_threads = $argument2; + } elsif ($switch =~ m/^--/) { # Config options + # Convert CLI option to variable name + # by removing '--' from the switch and replacing any + # '-' with a '_' + $switch =~ s/^--//; + $switch =~ s/-/_/g; + $cli_options{$switch} = $argument; + } else { # Other qsub options - passed as is + $qsub_opts .= "$switch $argument "; + } + } + } + if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20 + $array_job = 1; + $jobname = $1; + $jobstart = $2; + $jobend = $3; + shift; + if ($jobstart > $jobend) { + die "queue.pl: invalid job range $ARGV[0]"; + } + if ($jobstart <= 0) { + die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is a GridEngine limitation)."; + } + } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. + $array_job = 1; + $jobname = $1; + $jobstart = $2; + $jobend = $2; + shift; + } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { + print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; + } +} + +if (@ARGV < 2) { + print_usage(); +} + +if (exists $cli_options{"config"}) { + $config = $cli_options{"config"}; +} + +my $default_config_file = <<'EOF'; +# Default configuration +command qsub -V -v PATH -S /bin/bash -l mem=4G +option mem=* -l mem=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l ncpus=$0 +option num_threads=1 # Do not add anything to qsub_opts +default gpu=0 +option gpu=0 +option gpu=* -l ncpus=$0 +EOF + +# Here the configuration options specified by the user on the command line +# (e.g. --mem 2G) are converted to options to the qsub system as defined in +# the config file. (e.g. if the config file has the line +# "option mem=* -l ram_free=$0,mem_free=$0" +# and the user has specified '--mem 2G' on the command line, the options +# passed to queue system would be "-l ram_free=2G,mem_free=2G +# A more detailed description of the ways the options would be handled is at +# the top of this file. 
+ +my $opened_config_file = 1; + +open CONFIG, "<$config" or $opened_config_file = 0; + +my %cli_config_options = (); +my %cli_default_options = (); + +if ($opened_config_file == 0 && exists($cli_options{"config"})) { + print STDERR "Could not open config file $config\n"; + exit(1); +} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { + # Open the default config file instead + open (CONFIG, "echo '$default_config_file' |") or die "Unable to open pipe\n"; + $config = "Default config"; +} + +my $qsub_cmd = ""; +my $read_command = 0; + +while() { + chomp; + my $line = $_; + $_ =~ s/\s*#.*//g; + if ($_ eq "") { next; } + if ($_ =~ /^command (.+)/) { + $read_command = 1; + $qsub_cmd = $1 . " "; + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + # Config option that needs replacement with parameter value read from CLI + # e.g.: option mem=* -l mem_free=$0,ram_free=$0 + my $option = $1; # mem + my $arg= $2; # -l mem_free=$0,ram_free=$0 + if ($arg !~ m:\$0:) { + die "Unable to parse line '$line' in config file ($config)\n"; + } + if (exists $cli_options{$option}) { + # Replace $0 with the argument read from command line. + # e.g. "-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G" + $arg =~ s/\$0/$cli_options{$option}/g; + $cli_config_options{$option} = $arg; + } + } elsif ($_ =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) { + # Config option that does not need replacement + # e.g. option gpu=0 -q all.q + my $option = $1; # gpu + my $value = $2; # 0 + my $arg = $3; # -q all.q + if (exists $cli_options{$option}) { + $cli_default_options{($option,$value)} = $arg; + } + } elsif ($_ =~ m/^default (\S+)=(\S+)/) { + # Default options. Used for setting default values to options i.e. when + # the user does not specify the option on the command line + # e.g. default gpu=0 + my $option = $1; # gpu + my $value = $2; # 0 + if (!exists $cli_options{$option}) { + # If the user has specified this option on the command line, then we + # don't have to do anything + $cli_options{$option} = $value; + } + } else { + print STDERR "queue.pl: unable to parse line '$line' in config file ($config)\n"; + exit(1); + } +} + +close(CONFIG); + +if ($read_command != 1) { + print STDERR "queue.pl: config file ($config) does not contain the line \"command .*\"\n"; + exit(1); +} + +for my $option (keys %cli_options) { + if ($option eq "config") { next; } + if ($option eq "max_jobs_run" && $array_job != 1) { next; } + my $value = $cli_options{$option}; + + if (exists $cli_default_options{($option,$value)}) { + $qsub_opts .= "$cli_default_options{($option,$value)} "; + } elsif (exists $cli_config_options{$option}) { + $qsub_opts .= "$cli_config_options{$option} "; + } else { + if ($opened_config_file == 0) { $config = "default config file"; } + die "queue.pl: Command line option $option not described in $config (or value '$value' not allowed)\n"; + } +} + +my $cwd = getcwd(); +my $logfile = shift @ARGV; + +if ($array_job == 1 && $logfile !~ m/$jobname/ + && $jobend > $jobstart) { + print STDERR "queue.pl: you are trying to run a parallel job but " + . "you are putting the output into just one log file ($logfile)\n"; + exit(1); +} + +# +# Work out the command; quote escaping is done here. +# Note: the rules for escaping stuff are worked out pretty +# arbitrarily, based on what we want it to do. Some things that +# we pass as arguments to queue.pl, such as "|", we want to be +# interpreted by bash, so we don't escape them. 
Other things, +# such as archive specifiers like 'ark:gunzip -c foo.gz|', we want +# to be passed, in quotes, to the Kaldi program. Our heuristic +# is that stuff with spaces in should be quoted. This doesn't +# always work. +# +my $cmd = ""; + +foreach my $x (@ARGV) { + if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take + # as-is. + elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single + else { $cmd .= "\"$x\" "; } # else use double. +} + +# +# Work out the location of the script file, and open it for writing. +# +my $dir = dirname($logfile); +my $base = basename($logfile); +my $qdir = "$dir/q"; +$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q. +my $queue_logfile = "$qdir/$base"; + +if (!-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this... +if (!-d $dir) { die "Cannot make the directory $dir\n"; } +# make a directory called "q", +# where we will put the log created by qsub... normally this doesn't contain +# anything interesting, evertyhing goes to $logfile. +if (! -d "$qdir") { + system "mkdir $qdir 2>/dev/null"; + sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, + ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been + ## created and the job immediately ran, it would die with an error because nfs + ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our + ## NFS settings to something like 5 seconds. +} + +my $queue_array_opt = ""; +if ($array_job == 1) { # It's an array job. + $queue_array_opt = "-J $jobstart-$jobend"; + $logfile =~ s/$jobname/\$PBS_ARRAY_INDEX/g; # This variable will get + # replaced by qsub, in each job, with the job-id. + $cmd =~ s/$jobname/\$\{PBS_ARRAY_INDEX\}/g; # same for the command... + $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory + # is for the queue to put its log, and this doesn't need the task array subscript + # so we remove it. +} + +# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but +# with the suffix .sh. +my $queue_scriptfile = $queue_logfile; +($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) || ($queue_scriptfile .= ".sh"); +if ($queue_scriptfile !~ m:^/:) { + $queue_scriptfile = $cwd . "/" . $queue_scriptfile; # just in case. +} + +# We'll write to the standard input of "qsub" (the file-handle Q), +# the job that we want it to execute. +# Also keep our current PATH around, just in case there was something +# in it that we need (although we also source ./path.sh) + +my $syncfile = "$qdir/done.$$"; + +system("rm $queue_logfile $syncfile 2>/dev/null"); +# +# Write to the script file, and then close it. +# +open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile"; + +print Q "#!/bin/bash\n"; +print Q "cd $cwd\n"; +print Q ". ./path.sh\n"; +print Q "( echo '#' Running on \`hostname\`\n"; +print Q " echo '#' Started at \`date\`\n"; +print Q " echo -n '# '; cat <$logfile\n"; +print Q "time1=\`date +\"%s\"\`\n"; +print Q " ( $cmd ) 2>>$logfile >>$logfile\n"; +print Q "ret=\$?\n"; +print Q "time2=\`date +\"%s\"\`\n"; +print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n"; +print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n"; +print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137; + # let the script return with status 100 which will put it to E state; more easily rerunnable. 
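# (Status 137 corresponds to 128 + SIGKILL(9), the exit code a process gets
# when it is killed, e.g. by the kernel OOM killer or a hard queue limit.)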
+if ($array_job == 0) { # not an array job + print Q "touch $syncfile\n"; # so we know it's done. +} else { + print Q "touch $syncfile.\$PBS_ARRAY_INDEX\n"; # touch a bunch of sync-files. +} +print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine +print Q "## submitted with:\n"; # treats specially. +$qsub_cmd .= "-o $queue_logfile $qsub_opts $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1"; +print Q "# $qsub_cmd\n"; +if (!close(Q)) { # close was not successful... || die "Could not close script file $shfile"; + die "Failed to close the script file (full disk?)"; +} + +my $ret = system ($qsub_cmd); +if ($ret != 0) { + if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status) + if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; } + print STDERR "queue.pl: job writing to $logfile failed\n"; + } else { + print STDERR "queue.pl: error submitting jobs to queue (return status was $ret)\n"; + print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n"; + print STDERR `tail $queue_logfile`; + } + exit(1); +} + +my $sge_job_id; +if (! $sync) { # We're not submitting with -sync y, so we + # need to wait for the jobs to finish. We wait for the + # sync-files we "touched" in the script to exist. + my @syncfiles = (); + if (!defined $jobname) { # not an array job. + push @syncfiles, $syncfile; + } else { + for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { + push @syncfiles, "$syncfile.$jobid"; + } + } + # We will need the sge_job_id, to check that job still exists + { # Get the SGE job-id from the log file in q/ + open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile"; + undef $sge_job_id; + while () { + if (m/Your job\S* (\d+)[. ].+ has been submitted/) { + if (defined $sge_job_id) { + die "Error: your job was submitted more than once (see $queue_logfile)"; + } else { + $sge_job_id = $1; + } + } + } + close(L); + if (!defined $sge_job_id) { + die "Error: log file $queue_logfile does not specify the SGE job-id."; + } + } + my $check_sge_job_ctr=1; + # + my $wait = 0.1; + my $counter = 0; + foreach my $f (@syncfiles) { + # wait for them to finish one by one. + while (! -f $f) { + sleep($wait); + $wait *= 1.2; + if ($wait > 3.0) { + $wait = 3.0; # never wait more than 3 seconds. + # the following (.kick) commands are basically workarounds for NFS bugs. + if (rand() < 0.25) { # don't do this every time... + if (rand() > 0.5) { + system("touch $qdir/.kick"); + } else { + system("rm $qdir/.kick 2>/dev/null"); + } + } + if ($counter++ % 10 == 0) { + # This seems to kick NFS in the teeth to cause it to refresh the + # directory. I've seen cases where it would indefinitely fail to get + # updated, even though the file exists on the server. + # Only do this every 10 waits (every 30 seconds) though, or if there + # are many jobs waiting they can overwhelm the file server. + system("ls $qdir >/dev/null"); + } + } + + # Check that the job exists in SGE. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. + if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. + if ( -f $f ) { next; }; #syncfile appeared: OK. + $ret = system("qstat -t $sge_job_id >/dev/null 2>/dev/null"); + # system(...) : To get the actual exit value, shift $ret right by eight bits. 
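# (An exit value of 1 from qstat is taken below to mean the job is no longer
# known to the scheduler.)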
+ if ($ret>>8 == 1) { # Job does not seem to exist + # Don't consider immediately missing job as error, first wait some + # time to make sure it is not just delayed creation of the syncfile. + + sleep(3); + # Sometimes NFS gets confused and thinks it's transmitted the directory + # but it hasn't, due to timestamp issues. Changing something in the + # directory will usually fix that. + system("touch $qdir/.kick"); + system("rm $qdir/.kick 2>/dev/null"); + if ( -f $f ) { next; } #syncfile appeared, ok + sleep(7); + system("touch $qdir/.kick"); + sleep(1); + system("rm $qdir/.kick 2>/dev/null"); + if ( -f $f ) { next; } #syncfile appeared, ok + sleep(60); + system("touch $qdir/.kick"); + sleep(1); + system("rm $qdir/.kick 2>/dev/null"); + if ( -f $f ) { next; } #syncfile appeared, ok + $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f"; + my $job_id = $1; + if (defined $jobname) { + $logfile =~ s/\$PBS_ARRAY_INDEX/$job_id/g; + } + my $last_line = `tail -n 1 $logfile`; + if ($last_line =~ m/status 0$/ && (-M $logfile) < 0) { + # if the last line of $logfile ended with "status 0" and + # $logfile is newer than this program [(-M $logfile) gives the + # time elapsed between file modification and the start of this + # program], then we assume the program really finished OK, + # and maybe something is up with the file system. + print STDERR "**queue.pl: syncfile $f was not created but job seems\n" . + "**to have finished OK. Probably your file-system has problems.\n" . + "**This is just a warning.\n"; + last; + } else { + chop $last_line; + print STDERR "queue.pl: Error, unfinished job no " . + "longer exists, log is in $logfile, last line is '$last_line', " . + "syncfile is $f, return status of qstat was $ret\n" . + "Possible reasons: a) Exceeded time limit? -> Use more jobs!" . + " b) Shutdown/Frozen machine? -> Run again!\n"; + exit(1); + } + } elsif ($ret != 0) { + print STDERR "queue.pl: Warning: qstat command returned status $ret (qstat -t $sge_job_id,$!)\n"; + } + } + } + } + my $all_syncfiles = join(" ", @syncfiles); + system("rm $all_syncfiles 2>/dev/null"); +} + +# OK, at this point we are synced; we know the job is done. +# But we don't know about its exit status. We'll look at $logfile for this. +# First work out an array @logfiles of file-locations we need to +# read (just one, unless it's an array job). +my @logfiles = (); +if (!defined $jobname) { # not an array job. + push @logfiles, $logfile; +} else { + for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { + my $l = $logfile; + $l =~ s/\$PBS_ARRAY_INDEX/$jobid/g; + push @logfiles, $l; + } +} + +my $num_failed = 0; +my $status = 1; +foreach my $l (@logfiles) { + my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0); + for (my $iter = 0; $iter <= @wait_times; $iter++) { + my $line = `tail -10 $l 2>/dev/null`; # Note: although this line should be the last + # line of the file, I've seen cases where it was not quite the last line because + # of delayed output by the process that was running, or processes it had called. + # so tail -10 gives it a little leeway. + if ($line =~ m/with status (\d+)/) { + $status = $1; + last; + } else { + if ($iter < @wait_times) { + sleep($wait_times[$iter]); + } else { + if (! -f $l) { + print STDERR "Log-file $l does not exist.\n"; + } else { + print STDERR "The last line of log-file $l does not seem to indicate the " + . "return status as expected\n"; + } + exit(1); # Something went wrong with the queue, or the + # machine it was running on, probably. 
+ } + } + } + # OK, now we have $status, which is the return-status of + # the command in the job. + if ($status != 0) { $num_failed++; } +} +if ($num_failed == 0) { exit(0); } +else { # we failed. + if (@logfiles == 1) { + if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/$jobstart/g; } + print STDERR "queue.pl: job failed with status $status, log is in $logfile\n"; + if ($logfile =~ m/JOB/) { + print STDERR "queue.pl: probably you forgot to put JOB=1:\$nj in your script.\n"; + } + } else { + if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; } + my $numjobs = 1 + $jobend - $jobstart; + print STDERR "queue.pl: $num_failed / $numjobs failed, log is in $logfile\n"; + } + exit(1); +} diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh index 61c0962cf15..5de8b994705 100755 --- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh +++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2013 Johns Hopkins University (author: Daniel Povey) # 2014 Tom Ko @@ -36,7 +36,7 @@ which sox &>/dev/null ! [ $? -eq 0 ] && echo "sox: command not found" && exit 1; if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" + echo "$0: no such file $srcdir/utt2spk" exit 1; fi @@ -65,18 +65,18 @@ if [ -f $srcdir/segments ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} + '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp if [ -f $srcdir/reco2file_and_channel ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel fi - + rm $destdir/reco_map 2>/dev/null else # no segments->wav indexed by utterance. if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} + '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp fi fi @@ -88,6 +88,13 @@ if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi +if [ ! 
-f $srcdir/utt2dur ]; then + # generate utt2dur if it does not exist in srcdir + utils/data/get_utt2dur.sh $srcdir +fi + +cat $srcdir/utt2dur | utils/apply_map.pl -f 1 $destdir/utt_map | \ + awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/utt2dur rm $destdir/spk_map $destdir/utt_map 2>/dev/null echo "$0: generated speed-perturbed version of data in $srcdir, in $destdir" diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 7701a956235..0014f22a04e 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -28,20 +28,21 @@ # and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt # and extra_questions.txt # Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and -# non-silence phones respectively (where silence includes various kinds of -# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the # "real" phones.) -# In each line of those files is a list of phones, and the phones on each line -# are assumed to correspond to the same "base phone", i.e. they will be +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be # different stress or tone variations of the same basic phone. -# The file "optional_silence.txt" contains just a single phone (typically SIL) +# The file "optional_silence.txt" contains just a single phone (typically SIL) # which is used for optional silence in the lexicon. # extra_questions.txt might be empty; typically will consist of lists of phones, -# all members of each list with the same stress or tone; and also possibly a -# list for the silence phones. This will augment the automtically generated -# questions (note: the automatically generated ones will treat all the -# stress/tone versions of a phone the same, so will not "get to ask" about +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. This will augment the automatically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about # stress or tone). +# # This script adds word-position-dependent phones and constructs a host of other # derived files, that go in data/lang/. @@ -49,19 +50,20 @@ # Begin configuration section. num_sil_states=5 num_nonsil_states=3 +num_word_disambig_syms=1 position_dependent_phones=true -# position_dependent_phones is false also when position dependent phones and word_boundary.txt +# position_dependent_phones is false also when position dependent phones and word_boundary.txt # have been generated by another source reverse=false -share_silence_phones=false # if true, then share pdfs of different silence +share_silence_phones=false # if true, then share pdfs of different silence # phones together. sil_prob=0.5 phone_symbol_table= # if set, use a specified phones.txt file. # end configuration sections -. utils/parse_options.sh +. utils/parse_options.sh -if [ $# -ne 4 ]; then +if [ $# -ne 4 ]; then echo "usage: utils/prepare_lang.sh " echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" echo " should contain the following files:" @@ -114,8 +116,8 @@ fi # phones.txt file provided, we will do some sanity check here. if [[ ! 
-z $phone_symbol_table ]]; then # Checks if we have position dependent phones - n1=`cat $phone_symbol_table | grep -v -P "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l` - n2=`cat $phone_symbol_table | grep -v -P "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l` + n1=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l` + n2=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l` $position_dependent_phones && [ $n1 -eq $n2 ] &&\ echo "$0: Position dependent phones requested, but not in provided phone symbols" && exit 1; ! $position_dependent_phones && [ $n1 -ne $n2 ] &&\ @@ -123,7 +125,7 @@ if [[ ! -z $phone_symbol_table ]]; then # Checks if the phone sets match. cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table ' - BEGIN { while ((getline < f) > 0) { sub(/((_[BEIS])|) [0-9]+$/, "", $0); phones[$0] = 1; }} + BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} { for (x = 1; x <= NF; ++x) { if (!($x in phones)) { print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1; fi @@ -133,10 +135,10 @@ if $position_dependent_phones; then # adding the markers _B, _E, _S, _I depending on word position. # In this recipe, these markers apply to silence also. # Do this starting from lexiconp.txt only. - if "$silprob"; then + if "$silprob"; then perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; - if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } + if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt @@ -158,11 +160,11 @@ if $position_dependent_phones; then mv $tmpdir/lexiconp.pre_reverse $tmpdir/lexiconp.txt fi fi - + # create $tmpdir/phone_map.txt # this has the format (on each line) # ... - # where the versions depend on the position of the phone within a word. + # where the versions depend on the position of the phone within a word. # For instance, we'd have: # AA AA_B AA_E AA_I AA_S # for (B)egin, (E)nd, (I)nternal and (S)ingleton @@ -174,11 +176,11 @@ if $position_dependent_phones; then # This phone map expands the phone lists into all the word-position-dependent # versions of the phone lists. - cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ > $tmpdir/phone_map.txt else - if "$silprob"; then + if "$silprob"; then cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt if $reverse; then echo "We do not support reverse option and silprob at the same time" @@ -245,10 +247,10 @@ cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_m # be inside a word. 
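# For illustration (hypothetical phones AA and IY): the _B iteration below
# appends one line, "AA_B IY_B", listing every non-silence phone with the _B
# suffix; likewise for _E, _I and _S, and the silence loop also includes the
# suffix-less forms.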
if $position_dependent_phones; then for suffix in _B _E _I _S; do - (for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done for suffix in "" _B _E _I _S; do - (for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done fi @@ -277,7 +279,7 @@ if [[ ! -z $phone_symbol_table ]]; then start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'` echo "" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table ' BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\ - cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt + cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt else echo "" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ awk '{n=NR-1; print $1, n;}' > $dir/phones.txt @@ -313,7 +315,7 @@ fi cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' BEGIN { print " 0"; - } + } { if ($1 == "") { print " is in the vocabulary!" | "cat 1>&2" @@ -362,7 +364,7 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int # Create the basic L.fst without disambiguation symbols, for use -# in training. +# in training. if $silprob; then # Usually it's the same as having a fixed-prob L.fst @@ -386,7 +388,18 @@ cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; # integer version of oov symbol, used in some scripts. -# Create these lists of phones in colon-separated integer list form too, +# the file wdisambig.txt contains a (line-by-line) list of the text-form of the +# disambiguation symbols that are used in the grammar and passed through by the +# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork +# for more generality (which probably would be added by another script). +# wdisambig_words.int contains the corresponding list interpreted by the +# symbol table words.txt, and wdisambig_phones.int contains the corresponding +# list interpreted by the symbol table phones.txt. +echo '#0' >$dir/phones/wdisambig.txt +utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int +utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int + +# Create these lists of phones in colon-separated integer list form too, # for purposes of being given to programs as command-line options. for f in silence nonsilence optional_silence disambig context_indep; do utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int @@ -415,20 +428,18 @@ utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonel # Create the lexicon FST with disambiguation symbols, and put it in lang_test. # There is an extra step where we create a loop to "pass through" the # disambiguation symbols from G.fst. 
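# (For illustration: wdisambig_phones.int and wdisambig_words.int each hold the
# integer id of '#0' under phones.txt and words.txt respectively, and
# fstaddselfloops now reads those files instead of the old inline
# "echo <symbol-id> |" pipes.)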
-phone_disambig_symbol=`grep \#0 $dir/phones.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` if $silprob; then utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; else utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; fi diff --git a/egs/wsj/s5/utils/queue.pl b/egs/wsj/s5/utils/queue.pl index 1e36de63053..cba8ff1a191 100755 --- a/egs/wsj/s5/utils/queue.pl +++ b/egs/wsj/s5/utils/queue.pl @@ -18,7 +18,7 @@ # The script now supports configuring the queue system using a config file # (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. +# and a set of command line options. # The current script handles: # 1) Normal configuration arguments # For e.g. a command line option of "--gpu 1" could be converted into the option @@ -28,7 +28,7 @@ # $0 here in the line is replaced with the argument read from the CLI and the # resulting string is passed to qsub. # 2) Special arguments to options such as -# gpu=0 +# gpu=0 # If --gpu 0 is given in the command line, then no special "-q" is given. # 3) Default argument # default gpu=0 @@ -94,12 +94,12 @@ () print_usage(); } -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to +for (my $x = 1; $x <= 2; $x++) { # This for-loop is to # allow the JOB=1:n option to be interleaved with the # options to qsub. while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { my $switch = shift @ARGV; - + if ($switch eq "-V") { $qsub_opts .= "-V "; } else { @@ -116,10 +116,10 @@ () $num_threads = $argument2; } elsif ($switch =~ m/^--/) { # Config options # Convert CLI option to variable name - # by removing '--' from the switch and replacing any + # by removing '--' from the switch and replacing any # '-' with a '_' $switch =~ s/^--//; - $switch =~ s/-/_/g; + $switch =~ s/-/_/g; $cli_options{$switch} = $argument; } else { # Other qsub options - passed as is $qsub_opts .= "$switch $argument "; @@ -145,7 +145,7 @@ () $jobend = $2; shift; } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { - print STDERR "Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; + print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; } } @@ -155,7 +155,7 @@ () if (exists $cli_options{"config"}) { $config = $cli_options{"config"}; -} +} my $default_config_file = <<'EOF'; # Default configuration @@ -172,7 +172,7 @@ () # Here the configuration options specified by the user on the command line # (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line +# the config file. (e.g. 
if the config file has the line # "option mem=* -l ram_free=$0,mem_free=$0" # and the user has specified '--mem 2G' on the command line, the options # passed to queue system would be "-l ram_free=2G,mem_free=2G @@ -186,7 +186,7 @@ () my %cli_config_options = (); my %cli_default_options = (); -if ($opened_config_file == 0 && exists($cli_options{"config"})) { +if ($opened_config_file == 0 && exists($cli_options{"config"})) { print STDERR "Could not open config file $config\n"; exit(1); } elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { @@ -206,12 +206,12 @@ () if ($_ =~ /^command (.+)/) { $read_command = 1; $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { # Config option that needs replacement with parameter value read from CLI # e.g.: option mem=* -l mem_free=$0,ram_free=$0 my $option = $1; # mem my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { + if ($arg !~ m:\$0:) { die "Unable to parse line '$line' in config file ($config)\n"; } if (exists $cli_options{$option}) { @@ -231,7 +231,7 @@ () } } elsif ($_ =~ m/^default (\S+)=(\S+)/) { # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line + # the user does not specify the option on the command line # e.g. default gpu=0 my $option = $1; # gpu my $value = $2; # 0 @@ -291,7 +291,7 @@ () # my $cmd = ""; -foreach my $x (@ARGV) { +foreach my $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take # as-is. elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single @@ -312,19 +312,19 @@ () # make a directory called "q", # where we will put the log created by qsub... normally this doesn't contain # anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { +if (! -d "$qdir") { system "mkdir $qdir 2>/dev/null"; sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been ## created and the job immediately ran, it would die with an error because nfs ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our ## NFS settings to something like 5 seconds. -} +} my $queue_array_opt = ""; if ($array_job == 1) { # It's an array job. - $queue_array_opt = "-t $jobstart:$jobend"; - $logfile =~ s/$jobname/\$SGE_TASK_ID/g; # This variable will get + $queue_array_opt = "-t $jobstart:$jobend"; + $logfile =~ s/$jobname/\$SGE_TASK_ID/g; # This variable will get # replaced by qsub, in each job, with the job-id. $cmd =~ s/$jobname/\$\{SGE_TASK_ID\}/g; # same for the command... $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory @@ -455,14 +455,14 @@ () } } - # Check that the job exists in SGE. Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. + # Check that the job exists in SGE. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. if ( -f $f ) { next; }; #syncfile appeared: OK. $ret = system("qstat -j $sge_job_id >/dev/null 2>/dev/null"); # system(...) : To get the actual exit value, shift $ret right by eight bits. 
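As a minimal sketch of that idiom (the job id here is made up): Perl's system() returns the raw wait status, so the child's exit code is recovered with a right shift by eight bits:

    my $ret = system("qstat -j 12345 >/dev/null 2>/dev/null");
    my $exit_code = $ret >> 8;   # an exit code of 1 is taken to mean the job is no longer known to SGE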
if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. sleep(3); @@ -526,7 +526,7 @@ () push @logfiles, $logfile; } else { for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; + my $l = $logfile; $l =~ s/\$SGE_TASK_ID/$jobid/g; push @logfiles, $l; } diff --git a/egs/wsj/s5/utils/reverse_lm.sh b/egs/wsj/s5/utils/reverse_lm.sh index cc6b66dca03..228fff20fbe 100755 --- a/egs/wsj/s5/utils/reverse_lm.sh +++ b/egs/wsj/s5/utils/reverse_lm.sh @@ -38,25 +38,13 @@ mkdir -p $outdir for f in phones.txt words.txt L.fst L_disambig.fst phones/; do cp -r $langdir/$f $outdir done -gunzip -c $lm | utils/find_arpa_oovs.pl $outdir/words.txt > $tmpdir/oovs.txt -# grep -v ' ' because the LM seems to have some strange and useless -# stuff in it with multiple 's in the history. Encountered some other similar -# things in a LM from Geoff. Removing all "illegal" combinations of and , -# which are supposed to occur only at being/end of utt. These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -gunzip -c $lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' > $outdir/forward.arpa +gunzip -c $lm > $outdir/forward.arpa echo "Mapping ARPA to reverse ARPA" python utils/reverse_arpa.py $outdir/forward.arpa > $outdir/reverse.arpa -arpa2fst $outdir/reverse.arpa | fstprint | \ - grep -v "230258.5" | \ - utils/remove_oovs.pl $tmpdir/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$outdir/words.txt \ - --osymbols=$outdir/words.txt --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon > $outdir/G_org.fst +arpa2fst --disambig-symbol=#0 --read-symbol-table=$outdir/words.txt \ + $outdir/reverse.arpa | \ + fstprint | fgrep -v '230258.5' | fstcompile > $outdir/G_org.fst #--arc_type=log echo "Push weights to make it stochastic (log semi-ring)" @@ -84,7 +72,7 @@ if [ -f $lexicon ]; then < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$outdir/words.txt --osymbols=$outdir/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $outdir/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g fi diff --git a/egs/wsj/s5/utils/rnnlm_compute_scores.sh b/egs/wsj/s5/utils/rnnlm_compute_scores.sh index 75c4c262c49..060b645aca3 100755 --- a/egs/wsj/s5/utils/rnnlm_compute_scores.sh +++ b/egs/wsj/s5/utils/rnnlm_compute_scores.sh @@ -62,8 +62,16 @@ cat $tempdir/text | awk -v voc=$dir/wordlist.rnn -v unk=$dir/unk.probs \ # OK, now we compute the scores on the text with OOVs replaced # with -$rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/text.nounk -nbest -debug 0 | \ - awk '{print $1*log(10);}' > $tempdir/loglikes.rnn +if [ $rnnlm_ver == "faster-rnnlm" ]; then + $rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/text.nounk -nbest -debug 0 | \ + awk '{print $1*log(10);}' > $tempdir/loglikes.rnn +else + # add the utterance_id as required by Mikolove's rnnlm + paste $tempdir/ids $tempdir/text.nounk > $tempdir/id_text.nounk + + $rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/id_text.nounk -nbest -debug 0 | \ + awk '{print $1*log(10);}' > $tempdir/loglikes.rnn +fi [ `cat $tempdir/loglikes.rnn | wc -l` -ne `cat 
$tempdir/loglikes.oov | wc -l` ] && \ echo "rnnlm rescoring failed" && exit 1; diff --git a/egs/wsj/s5/utils/run.pl b/egs/wsj/s5/utils/run.pl index 6145a7ac54f..7df65c086d9 100755 --- a/egs/wsj/s5/utils/run.pl +++ b/egs/wsj/s5/utils/run.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl use warnings; #sed replacement for -w perl parameter -# In general, doing +# In general, doing # run.pl some.log a b c is like running the command a b c in # the bash shell, and putting the standard error and output into some.log. # To run parallel jobs (backgrounded on the host machine), you can do (e.g.) @@ -13,7 +13,7 @@ # run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz # and run.pl will run something like: # ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log -# +# # Basically it takes the command-line arguments, quotes them # as necessary to preserve spaces, and evaluates them with bash. # In addition it puts the command line at the top of the log, and @@ -35,10 +35,12 @@ # options that would normally be given to # queue.pl, which we will just discard. -if (@ARGV > 0) { - while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options - # that would normally go to qsub, but which will be ignored here. - $switch = shift @ARGV; +for (my $x = 1; $x <= 2; $x++) { # This for-loop is to + # allow the JOB=1:n option to be interleaved with the + # options to qsub. + while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { + # parse any options that would normally go to qsub, but which will be ignored here. + my $switch = shift @ARGV; if ($switch eq "-V") { $ignored_opts .= "-V "; } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") { @@ -48,19 +50,26 @@ die "run.pl: invalid option --max-jobs-run $max_jobs_run"; } } else { - $option = shift @ARGV; - if ($switch eq "-sync" && $option =~ m/^[yY]/) { + my $argument = shift @ARGV; + if ($argument =~ m/^--/) { + print STDERR "WARNING: suspicious argument '$argument' to $switch; starts with '-'\n"; + } + if ($switch eq "-sync" && $argument =~ m/^[yY]/) { $ignored_opts .= "-sync "; # Note: in the # corresponding code in queue.pl it says instead, just "$sync = 1;". - } - $ignored_opts .= "$switch $option "; - if ($switch eq "-pe") { # e.g. -pe smp 5 - $option2 = shift @ARGV; - $ignored_opts .= "$option2 "; + } elsif ($switch eq "-pe") { # e.g. -pe smp 5 + my $argument2 = shift @ARGV; + $ignored_opts .= "$switch $argument $argument2 "; + } elsif ($switch =~ m/^--/) { # Config options + # Convert CLI new-style options + # Ignore all options + $ignored_opts .= "$switch $argument "; + } else { # Other qsub options - passed as is + $ignored_opts .= "$switch $argument "; } } } - if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 + if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20 $jobname = $1; $jobstart = $2; $jobend = $3; @@ -83,7 +92,7 @@ # Users found this message confusing so we are removing it. # if ($ignored_opts ne "") { -# print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n"; +# print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n"; # } if ($max_jobs_run == -1) { # If --max-jobs-run option not set, @@ -136,10 +145,10 @@ $cmd = ""; -foreach $x (@ARGV) { +foreach $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . 
" "; } elsif ($x =~ m:\":) { $cmd .= "'$x' "; } - else { $cmd .= "\"$x\" "; } + else { $cmd .= "\"$x\" "; } } #$Data::Dumper::Indent=0; @@ -150,7 +159,7 @@ use POSIX ":sys_wait_h"; for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { if (scalar(keys %active_pids) >= $max_jobs_run) { - + # Lets wait for a change in any child's status # Then we have to work out which child finished $r = waitpid(-1, 0); @@ -158,7 +167,7 @@ if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen. if ( defined $active_pids{$r} ) { $jid=$active_pids{$r}; - $fail[$jid]=$code; + $fail[$jid]=$code; if ($code !=0) { $numfail++;} delete $active_pids{$r}; # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n"; @@ -166,7 +175,7 @@ die "run.pl: Cannot find the PID of the chold process that just finished."; } - # In theory we could do a non-blocking waitpid over all jobs running just + # In theory we could do a non-blocking waitpid over all jobs running just # to find out if only one or more jobs finished during the previous waitpid() # However, we just omit this and will reap the next one in the next pass # through the for(;;) cycle @@ -175,7 +184,7 @@ if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; } if ($childpid == 0) { # We're in the child... this branch # executes the job and returns (possibly with an error status). - if (defined $jobname) { + if (defined $jobname) { $cmd =~ s/$jobname/$jobid/g; $logfile =~ s/$jobname/$jobid/g; } @@ -188,7 +197,7 @@ close(F); # Pipe into bash.. make sure we're not using any other shell. - open(B, "|bash") || die "run.pl: Error opening shell command"; + open(B, "|bash") || die "run.pl: Error opening shell command"; print B "( " . $cmd . ") 2>>$logfile >> $logfile"; close(B); # If there was an error, exit status is in $? $ret = $?; @@ -231,9 +240,9 @@ $job_return = $fail[$jobid]; if (not defined $job_return ) { # print Dumper(\@fail); - - die "run.pl: Sanity check failed: we have indication that some jobs are running " . - "even after we waited for all jobs to finish" ; + + die "run.pl: Sanity check failed: we have indication that some jobs are running " . + "even after we waited for all jobs to finish" ; } if ($job_return != 0 ){ $failed_jids++;} } @@ -244,7 +253,7 @@ if ($ret != 0) { $njobs = $jobend - $jobstart + 1; - if ($njobs == 1) { + if ($njobs == 1) { if (defined $jobname) { $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with # that job. diff --git a/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl b/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl index 710da8a4b4c..6db8ea7455f 100755 --- a/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl +++ b/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl @@ -96,6 +96,8 @@ sub format_sys { my @F=split; die "Incompatible format of the utt2spk file: $_" if @F != 2; $UTTMAP{$F[0]} = $F[1]; + # Set width of speaker column by its longest label, + if($SPK_WIDTH < length($F[1])) { $SPK_WIDTH = length($F[1]) } } close(UTT2SPK); diff --git a/egs/wsj/s5/utils/slurm.pl b/egs/wsj/s5/utils/slurm.pl index 68c269080ac..8095272732e 100755 --- a/egs/wsj/s5/utils/slurm.pl +++ b/egs/wsj/s5/utils/slurm.pl @@ -11,7 +11,7 @@ use Cwd; use Getopt::Long; -# slurm.pl was created from the queue.pl +# slurm.pl was created from the queue.pl # queue.pl has the same functionality as run.pl, except that # it runs the job in question on the queue (Sun GridEngine). 
# This version of queue.pl uses the task array functionality @@ -20,7 +20,7 @@ # The script now supports configuring the queue system using a config file # (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. +# and a set of command line options. # The current script handles: # 1) Normal configuration arguments # For e.g. a command line option of "--gpu 1" could be converted into the option @@ -30,7 +30,7 @@ # $0 here in the line is replaced with the argument read from the CLI and the # resulting string is passed to qsub. # 2) Special arguments to options such as -# gpu=0 +# gpu=0 # If --gpu 0 is given in the command line, then no special "-q" is given. # 3) Default argument # default gpu=0 @@ -60,7 +60,7 @@ my $qsub_opts = ""; my $sync = 0; my $num_threads = 1; -my $max_jobs_run; +my $max_jobs_run; my $gpu = 0; my $config = "conf/slurm.conf"; @@ -99,12 +99,12 @@ () print_usage(); } -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to +for (my $x = 1; $x <= 3; $x++) { # This for-loop is to # allow the JOB=1:n option to be interleaved with the # options to qsub. while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { my $switch = shift @ARGV; - + if ($switch eq "-V") { $qsub_opts .= "-V "; } else { @@ -121,10 +121,10 @@ () $num_threads = $argument2; } elsif ($switch =~ m/^--/) { # Config options # Convert CLI option to variable name - # by removing '--' from the switch and replacing any + # by removing '--' from the switch and replacing any # '-' with a '_' $switch =~ s/^--//; - $switch =~ s/-/_/g; + $switch =~ s/-/_/g; $cli_options{$switch} = $argument; } else { # Other qsub options - passed as is $qsub_opts .= "$switch $argument "; @@ -160,7 +160,7 @@ () if (exists $cli_options{"config"}) { $config = $cli_options{"config"}; -} +} my $default_config_file = <<'EOF'; # Default configuration @@ -168,17 +168,18 @@ () option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 # Do not add anything to qsub_opts -option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 +option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts -option max_jobs_run=* # Do nothing default gpu=0 option gpu=0 -p shared option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0 # this has to be figured out +# note: the --max-jobs-run option is supported as a special case +# by slurm.pl and you don't have to handle it in the config file. EOF # Here the configuration options specified by the user on the command line # (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line +# the config file. (e.g. if the config file has the line # "option mem=* -l ram_free=$0,mem_free=$0" # and the user has specified '--mem 2G' on the command line, the options # passed to queue system would be "-l ram_free=2G,mem_free=2G @@ -192,7 +193,7 @@ () my %cli_config_options = (); my %cli_default_options = (); -if ($opened_config_file == 0 && exists($cli_options{"config"})) { +if ($opened_config_file == 0 && exists($cli_options{"config"})) { print STDERR "Could not open config file $config\n"; exit(1); } elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { @@ -212,12 +213,12 @@ () if ($_ =~ /^command (.+)/) { $read_command = 1; $qsub_cmd = $1 . 
" "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { # Config option that needs replacement with parameter value read from CLI # e.g.: option mem=* -l mem_free=$0,ram_free=$0 my $option = $1; # mem my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { + if ($arg !~ m:\$0:) { print STDERR "Warning: the line '$line' in config file ($config) does not substitution variable \$0\n"; } if (exists $cli_options{$option}) { @@ -237,7 +238,7 @@ () } } elsif ($_ =~ m/^default (\S+)=(\S+)/) { # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line + # the user does not specify the option on the command line # e.g. default gpu=0 my $option = $1; # gpu my $value = $2; # 0 @@ -261,19 +262,25 @@ () for my $option (keys %cli_options) { if ($option eq "config") { next; } - if ($option eq "max_jobs_run" && $array_job != 1) { print STDERR "Ignoring $option\n"; next; } + my $value = $cli_options{$option}; - - if ($option eq "max_jobs_run") { $max_jobs_run = $value; } - if (exists $cli_default_options{($option,$value)}) { + if ($option eq "max_jobs_run") { + if ($array_job != 1) { + print STDERR "Ignoring $option since this is not an array task."; + } else { + $max_jobs_run = $value; + } + } elsif (exists $cli_default_options{($option,$value)}) { $qsub_opts .= "$cli_default_options{($option,$value)} "; } elsif (exists $cli_config_options{$option}) { $qsub_opts .= "$cli_config_options{$option} "; } elsif (exists $cli_default_options{($option,"*")}) { $qsub_opts .= $cli_default_options{($option,"*")} . " "; } else { - if ($opened_config_file == 0) { $config = "default config file"; } + if ($opened_config_file == 0) { + $config = "default config file"; + } die "$0: Command line option $option not described in $config (or value '$value' not allowed)\n"; } } @@ -301,7 +308,7 @@ () # my $cmd = ""; -foreach my $x (@ARGV) { +foreach my $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take # as-is. elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single @@ -322,23 +329,23 @@ () # make a directory called "q", # where we will put the log created by qsub... normally this doesn't contain # anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { +if (! -d "$qdir") { system "mkdir $qdir 2>/dev/null"; sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been ## created and the job immediately ran, it would die with an error because nfs ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our ## NFS settings to something like 5 seconds. -} +} my $queue_array_opt = ""; if ($array_job == 1) { # It's an array job. if ($max_jobs_run) { - $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; + $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; } else { - $queue_array_opt = "--array ${jobstart}-${jobend}"; + $queue_array_opt = "--array ${jobstart}-${jobend}"; } - $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get + $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get # replaced by qsub, in each job, with the job-id. $cmd =~ s/$jobname/\$\{SLURM_ARRAY_TASK_ID\}/g; # same for the command... $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory @@ -475,14 +482,14 @@ () } } - # Check that the job exists in SLURM. 
Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. + # Check that the job exists in SLURM. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. if ( -f $f ) { next; }; #syncfile appeared: OK. $ret = system("squeue -j $sge_job_id >/dev/null 2>/dev/null"); # system(...) : To get the actual exit value, shift $ret right by eight bits. if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. sleep(3); @@ -546,7 +553,7 @@ () push @logfiles, $logfile; } else { for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; + my $l = $logfile; $l =~ s/\$SLURM_ARRAY_TASK_ID/$jobid/g; push @logfiles, $l; } diff --git a/egs/wsj/s5/utils/split_data.sh b/egs/wsj/s5/utils/split_data.sh index 941890cdd57..c6b501e2b0c 100755 --- a/egs/wsj/s5/utils/split_data.sh +++ b/egs/wsj/s5/utils/split_data.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2010-2013 Microsoft Corporation +# Copyright 2010-2013 Microsoft Corporation # Johns Hopkins University (Author: Daniel Povey) # Licensed under the Apache License, Version 2.0 (the "License"); @@ -56,9 +56,9 @@ if [ -f $data/text ] && [ $nu -ne $nt ]; then fi s1=$data/split$numsplit/1 -if [ ! -d $s1 ]; then +if [ ! -d $s1 ]; then need_to_split=true -else +else need_to_split=false for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \ vad.scp segments reco2file_and_channel utt2lang; do @@ -71,11 +71,17 @@ fi if ! $need_to_split; then exit 0; fi - -for n in `seq $numsplit`; do - mkdir -p $data/split$numsplit/$n - utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk" -done + +utt2spks=$(for n in `seq $numsplit`; do echo $data/split$numsplit/$n/utt2spk; done) + +directories=$(for n in `seq $numsplit`; do echo $data/split$numsplit/$n; done) + +# if this mkdir fails due to argument-list being too long, iterate. +if ! mkdir -p $directories >&/dev/null; then + for n in `seq $numsplit`; do + mkdir -p $data/split$numsplit/$n + done +fi if $split_per_spk; then utt2spk_opt="--utt2spk=$data/utt2spk" @@ -84,7 +90,8 @@ else fi # If lockfile is not installed, just don't lock it. It's not a big deal. -which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock +which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock +trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1 @@ -115,21 +122,24 @@ for f in spk2gender spk2warp cmvn.scp; do fi done -for n in `seq $numsplit`; do - dsn=$data/split$numsplit/$n - if [ -f $data/segments ]; then - utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments - awk '{print $2;}' $dsn/segments | sort | uniq > $data/tmp.reco # recording-ids. - if [ -f $data/reco2file_and_channel ]; then - utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel - fi - if [ -f $data/wav.scp ]; then - utils/filter_scp.pl $data/tmp.reco $data/wav.scp >$dsn/wav.scp - fi - rm $data/tmp.reco - fi # else it would have been handled above, see maybe_wav. 
-done - -rm -f $data/.split_lock +if [ -f $data/segments ]; then + utils/filter_scps.pl JOB=1:$numsplit \ + $data/split$numsplit/JOB/utt2spk $data/segments $data/split$numsplit/JOB/segments || exit 1 + for n in `seq $numsplit`; do + dsn=$data/split$numsplit/$n + awk '{print $2;}' $dsn/segments | sort | uniq > $dsn/tmp.reco # recording-ids. + done + if [ -f $data/reco2file_and_channel ]; then + utils/filter_scps.pl JOB=1:$numsplit \ + $data/split$numsplit/JOB/tmp.reco $data/reco2file_and_channel \ + $data/split$numsplit/JOB/reco2file_and_channel || exit 1 + fi + if [ -f $data/wav.scp ]; then + utils/filter_scps.pl JOB=1:$numsplit \ + $data/split$numsplit/JOB/tmp.reco $data/wav.scp \ + $data/split$numsplit/JOB/wav.scp || exit 1 + fi + for f in $data/split$numsplit/*/tmp.reco; do rm $f; done +fi exit 0 diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl index 70bc8033c9d..be2767ccb8d 100755 --- a/egs/wsj/s5/utils/split_scp.pl +++ b/egs/wsj/s5/utils/split_scp.pl @@ -72,7 +72,7 @@ @OUTPUTS = @ARGV; } else { for ($j = 0; $j < $num_jobs; $j++) { - if ($j == $job_id) { + if ($j == $job_id) { if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; } else { push @OUTPUTS, "-"; } } else { @@ -98,12 +98,12 @@ $s = $utt2spk{$u}; if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } if(!defined $spk_count{$s}) { - push @spkrs, $s; + push @spkrs, $s; $spk_count{$s} = 0; - $spk_data{$s} = ""; + $spk_data{$s} = []; # ref to new empty array. } $spk_count{$s}++; - $spk_data{$s} = $spk_data{$s} . $_; + push @{$spk_data{$s}}, $_; } # Now split as equally as possible .. # First allocate spks to files by allocating an approximately @@ -182,7 +182,7 @@ $error = 1; } else { foreach $spk ( @{$scparray[$scpidx]} ) { - print F $spk_data{$spk}; + print F @{$spk_data{$spk}}; $count += $spk_count{$spk}; } if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } @@ -190,7 +190,7 @@ close(F); } } else { - # This block is the "normal" case where there is no --utt2spk + # This block is the "normal" case where there is no --utt2spk # option and we just break into equal size chunks. open(I, "<$inscp") || die "Opening input scp file $inscp"; diff --git a/egs/wsj/s5/utils/subset_data_dir.sh b/egs/wsj/s5/utils/subset_data_dir.sh index be74ac8c177..154b9c81c0a 100755 --- a/egs/wsj/s5/utils/subset_data_dir.sh +++ b/egs/wsj/s5/utils/subset_data_dir.sh @@ -106,6 +106,7 @@ function do_filtering { [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp [ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp [ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang + [ -f $srcdir/utt2dur ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp [ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp [ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp @@ -159,7 +160,7 @@ elif $perspk; then do_filtering; # bash function. exit 0; else - if [ $numutt -gt `cat $srcdir/feats.scp | wc -l` ]; then + if [ $numutt -gt `cat $srcdir/utt2spk | wc -l` ]; then echo "subset_data_dir.sh: cannot subset to more utterances than you originally had." 
exit 1; fi diff --git a/egs/wsj/s5/utils/subset_scp.pl b/egs/wsj/s5/utils/subset_scp.pl index a8bcdfc1fc3..11fddc09a0f 100755 --- a/egs/wsj/s5/utils/subset_scp.pl +++ b/egs/wsj/s5/utils/subset_scp.pl @@ -71,23 +71,27 @@ } sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if($num_needed > $diff) { die "select_n: code error"; } - if($diff == 1 ) { - if($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); + my ($start,$end,$num_needed) = @_; + my $diff = $end - $start; + if ($num_needed > $diff) { + die "select_n: code error"; + } + if ($diff == 1 ) { + if ($num_needed > 0) { + print $F[$start]; } + } else { + my $halfdiff = int($diff/2); + my $halfneeded = int($num_needed/2); + select_n($start, $start+$halfdiff, $halfneeded); + select_n($start+$halfdiff, $end, $num_needed - $halfneeded); + } } if ( ! $first && ! $last) { - select_n(0, $numlines, $N); + if ($N > 0) { + select_n(0, $numlines, $N); + } } else { if ($first) { # --first option: same as head. for ($n = 0; $n < $N; $n++) { diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index da962177bef..19452c3c235 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -133,7 +133,7 @@ if [ -f $data/wav.scp ]; then ! cat $data/segments | \ awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ echo "$0: badly formatted segments file" && exit 1; - + segments_len=`cat $data/segments | wc -l` if [ -f $data/text ]; then ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/text) && \ @@ -153,14 +153,14 @@ if [ -f $data/wav.scp ]; then # this file is needed only for ctm scoring; it's indexed by recording-id. check_sorted_and_uniq $data/reco2file_and_channel ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { if ( NF == 3 && $3 == "1" ) { warning_issued = 1; } else { - print "Bad line ", $0; exit 1; + print "Bad line ", $0; exit 1; } } - } + } END { if (warning_issued == 1) { print "The channel should be marked as A or B, not 1! You should change it ASAP! " @@ -188,14 +188,14 @@ if [ -f $data/wav.scp ]; then # this file is needed only for ctm scoring; it's indexed by recording-id. check_sorted_and_uniq $data/reco2file_and_channel ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { if ( NF == 3 && $3 == "1" ) { warning_issued = 1; } else { - print "Bad line ", $0; exit 1; + print "Bad line ", $0; exit 1; } } - } + } END { if (warning_issued == 1) { print "The channel should be marked as A or B, not 1! You should change it ASAP! " @@ -228,6 +228,7 @@ if [ -f $data/feats.scp ]; then fi fi + if [ -f $data/cmvn.scp ]; then check_sorted_and_uniq $data/cmvn.scp cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn @@ -283,7 +284,7 @@ if [ -f $data/utt2warp ]; then fi # check some optionally-required things -for f in vad.scp utt2lang; do +for f in vad.scp utt2lang utt2uniq; do if [ -f $data/$f ]; then check_sorted_and_uniq $data/$f if ! 
cmp -s <( awk '{print $1}' $data/utt2spk ) \ @@ -294,4 +295,19 @@ for f in vad.scp utt2lang; do fi done + +if [ -f $data/utt2dur ]; then + check_sorted_and_uniq $data/utt2dur + cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur + if ! cmp -s $tmpdir/utts{,.utt2dur}; then + echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.feats} + exit 1; + fi + cat $data/utt2dur | \ + awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 +fi + + echo "$0: Successfully validated data-directory $data" diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl index ca33f84c8c4..5cc04c1e6ff 100755 --- a/egs/wsj/s5/utils/validate_dict_dir.pl +++ b/egs/wsj/s5/utils/validate_dict_dir.pl @@ -25,6 +25,7 @@ if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} $idx = 1; %silence = (); +$crlf = 1; print "--> reading $dict/silence_phones.txt\n"; while() { @@ -32,19 +33,24 @@ print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; set_to_fail(); } + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } my @col = split(" ", $_); if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; + set_to_fail(); + print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; } foreach(0 .. @col-1) { my $p = $col[$_]; if($silence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; } else {$silence{$p} = 1;} - if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){ + if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/){ set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form"; - + print "--> ERROR: phone \"$p\" has disallowed written form\n"; + } } $idx ++; @@ -59,14 +65,20 @@ if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} $idx = 1; $success = 1; +$crlf = 1; print "--> reading $dict/optional_silence.txt\n"; while() { chomp; my @col = split(" ", $_); if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; + set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; + set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; + } + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; } $idx ++; } @@ -81,25 +93,31 @@ $idx = 1; %nonsilence = (); $success = 1; +$crlf = 1; print "--> reading $dict/nonsilence_phones.txt\n"; while() { + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } if (! s/\n$//) { print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; set_to_fail(); } my @col = split(" ", $_); if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; + set_to_fail(); + print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; } foreach(0 .. 
@col-1) { my $p = $col[$_]; if($nonsilence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; } else {$nonsilence{$p} = 1;} - if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){ + if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/){ set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form"; - + print "--> ERROR: phone \"$p\" has disallowed written form\n"; + } } $idx ++; @@ -134,9 +152,14 @@ sub check_lexicon { print "Checking $lex\n"; !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); my %seen_line = {}; - $idx = 1; $success = 1; + $idx = 1; $success = 1; $crlf = 1; print "--> reading $lex\n"; while () { + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } if (defined $seen_line{$_}) { print "--> ERROR: line '$_' of $lex is repeated\n"; set_to_fail(); @@ -157,7 +180,7 @@ sub check_lexicon { } for ($n = 0; $n < $num_prob_cols; $n++) { $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { + if (!($prob > 0.0 && $prob <= 1.0)) { print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; set_to_fail(); } @@ -171,7 +194,7 @@ sub check_lexicon { foreach (0 .. @col-1) { if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; + print "(line $idx)\n"; set_to_fail(); } } @@ -191,16 +214,22 @@ sub check_lexicon { if (-f "$dict/silprob.txt") { !open(SP, "<$dict/silprob.txt") && print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); + $crlf = 1; while () { + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } chomp; my @col = split; @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { + if (!($col[1] > 0.0 && $col[1] <= 1.0)) { set_to_fail(); print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; } } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { + if ($col[1] <= 0.0) { set_to_fail(); print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; } @@ -290,8 +319,14 @@ sub check_lexicon_pair { } $idx = 1; $success = 1; + $crlf = 1; print "--> reading $dict/extra_questions.txt\n"; while() { + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } if (! s/\n$//) { print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; set_to_fail(); @@ -302,7 +337,7 @@ sub check_lexicon_pair { } foreach (0 .. @col-1) { if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n"; + set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n"; } $idx ++; } @@ -336,7 +371,7 @@ sub check_lexicon_pair { $num_warn_nosplit_limit = 10; while() { my @col = split(" ", $_); - foreach $p1 (@col) { + foreach $p1 (@col) { foreach $p2 (@col) { if ($p1 ne $p2 && ! 
$distinguished{$p1,$p2}) { set_to_fail(); diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index 0d00379f82c..657142689ee 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -8,20 +8,28 @@ $skip_det_check = 0; +$skip_disambig_check = 0; if (@ARGV > 0 && $ARGV[0] eq "--skip-determinization-check") { $skip_det_check = 1; shift @ARGV; } +if (@ARGV > 0 && $ARGV[0] eq "--skip-disambig-check") { + $skip_disambig_check = 1; + shift @ARGV; +} + if (@ARGV != 1) { print "Usage: $0 [options] \n"; print "e.g.: $0 data/lang\n"; print "Options:\n"; print " --skip-determinization-check (this flag causes it to skip a time consuming check).\n"; + print " --skip-disambig-check (this flag causes it to skip a disambig check in phone bigram models).\n"; exit(1); } +print "$0 " . join(" ", @ARGV) . "\n"; $lang = shift @ARGV; $exit = 0; @@ -48,7 +56,7 @@ $idx ++; } close(P); -%pint2sym = (); +%pint2sym = (); foreach (keys %psymtab) { if ($pint2sym{$psymtab{$_}}) { print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1; @@ -81,7 +89,7 @@ $idx ++; } close(W); -%wint2sym = (); +%wint2sym = (); foreach (keys %wsymtab) { if ($wint2sym{$wsymtab{$_}}) { print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1; @@ -89,15 +97,7 @@ $wint2sym{$wsymtab{$_}} = $_; } } -if (exists $wsymtab{"#0"}) { - print "--> $lang/words.txt has \"#0\"\n"; - print "--> $lang/words.txt is OK\n"; -} else { - $warning = 1; - print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; - print "--> (if you are using ARPA-type language models, you will normally\n"; - print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; -} +print "--> $lang/words.txt is OK\n"; print "\n"; # Checking phones/* ------------------------------- @@ -113,7 +113,6 @@ sub check_txt_int_csl { if (!open(CSL, "<$cat.csl")) { $exit = 1; return print "--> ERROR: fail to open $cat.csl\n"; } - if (-z "$cat.txt") { $warning = 1; print "--> WARNING: $cat.txt is empty\n"; } @@ -172,7 +171,7 @@ sub check_txt_int_csl { close(CSL); if ($idx1 != 0) { # nonempty .txt,.int files if ($num_lines != 1) { - $exit = 1; + $exit = 1; return print "--> ERROR: expect 1 line in $cat.csl\n"; } } else { @@ -212,7 +211,7 @@ sub check_txt_int { s/ internal$//g; s/ singleton$//g; $entry[$idx1] = $_; - $idx1 ++; + $idx1 ++; } close(TXT); $idx1 --; print "--> $idx1 entry/entries in $cat.txt\n"; @@ -287,7 +286,7 @@ sub check_disjoint { if (!open(N, "<$lang/phones/nonsilence.txt")) { $exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n"; } - if (!open(D, "<$lang/phones/disambig.txt")) { + if (!$skip_disambig_check && !open(D, "<$lang/phones/disambig.txt")) { $exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n"; } @@ -305,7 +304,7 @@ sub check_disjoint { } close(S); - $idx = 1; + $idx = 1; while () { chomp; my @col = split(" ", $_); @@ -382,7 +381,7 @@ sub check_summation { if (scalar(keys %nonsilence) == 0) { $exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n"; } - if (scalar(keys %disambig) == 0) { + if (!$skip_disambig_check && scalar(keys %disambig) == 0) { $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n"; } @@ -427,8 +426,11 @@ sub check_summation { check_disjoint; print "\n"; check_summation; print "\n"; -@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence"); +@list1 = ("context_indep", "nonsilence", "silence", "optional_silence"); 
@list2 = ("roots", "sets"); +if (!$skip_disambig_check) { + push(@list1, "disambig"); +} foreach (@list1) { check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n"; } @@ -439,14 +441,11 @@ sub check_summation { check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n"; } else { print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n"; - if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) { - print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n"; - $warning = 1; - } else { + if (!((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int"))) { print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n"; $exit = 1; } -} +} if (-e "$lang/phones/word_boundary.txt") { check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n"; } @@ -476,19 +475,21 @@ sub check_summation { $success == 0 || print "--> $lang/phones/optional_silence.txt is OK\n"; print "\n"; -# Check disambiguation symbols ------------------------------- -print "Checking disambiguation symbols: #0 and #1\n"; -if (scalar(keys %disambig) == 0) { - $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n"; -} -if (exists $disambig{"#0"} and exists $disambig{"#1"}) { - print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n"; - print "--> $lang/phones/disambig.txt is OK\n\n"; -} else { - print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n"; - print "--> this would not be OK with a conventional ARPA-type language\n"; - print "--> model or a conventional lexicon (L.fst)\n"; - $warning = 1; +if (!$skip_disambig_check) { + # Check disambiguation symbols ------------------------------- + print "Checking disambiguation symbols: #0 and #1\n"; + if (scalar(keys %disambig) == 0) { + $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n"; + } + if (exists $disambig{"#0"} and exists $disambig{"#1"}) { + print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n"; + print "--> $lang/phones/disambig.txt is OK\n\n"; + } else { + print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n"; + print "--> this would not be OK with a conventional ARPA-type language\n"; + print "--> model or a conventional lexicon (L.fst)\n"; + $warning = 1; + } } @@ -500,48 +501,46 @@ sub check_summation { if (!open(T, "<$lang/topo")) { $exit = 1; print "--> ERROR: fail to open $lang/topo\n"; } else { + $topo_ok = 1; $idx = 1; + %phones_in_topo_int_hash = ( ); + %phones_in_topo_hash = ( ); while () { chomp; next if (m/^<.*>[ ]*$/); - if ($idx == 1) { - $nonsilence_seq = $_; $idx ++; - } - if ($idx == 2) { - $silence_seq = $_; + foreach $i (split(" ", $_)) { + if (defined $phones_in_topo_int_hash{$i}) { + $topo_ok = 0; + $exit = 1; print "--> ERROR: $lang/topo has phone $i twice\n"; + } + if (!defined $pint2sym{$i}) { + $topo_ok = 0; + $exit = 1; print "--> ERROR: $lang/topo has phone $i which is not in phones.txt\n"; + } + $phones_in_topo_int_hash{$i} = 1; + $phones_in_topo_hash{$pint2sym{$i}} = 1; } } close(T); - if ($silence_seq == 0 || $nonsilence_seq == 0) { - $exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n"; - } - @silence_seq = split(" ", $silence_seq); - @nonsilence_seq = split(" ", $nonsilence_seq); - $success1 = 1; - if (@nonsilence_seq != @nonsilence) { - $exit = 1; print "--> ERROR: $lang/topo's nonsilence 
section doesn't correspond to nonsilence.txt\n"; - } else { - foreach (0 .. scalar(@nonsilence)-1) { - if ($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) { - $exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n"; - $success = 0; - } + $phones_that_should_be_in_topo_hash = {}; + foreach $p (@silence, @nonsilence) { $phones_that_should_be_in_topo_hash{$p} = 1; } + foreach $p (keys %phones_that_should_be_in_topo_hash) { + if ( ! defined $phones_in_topo_hash{$p}) { + $topo_ok = 0; + $i = $pint2sym{$p}; + $exit = 1; print "--> ERROR: $lang/topo does not cover phone $p (label = $i)\n"; } } - $success1 != 1 || print "--> $lang/topo's nonsilence section is OK\n"; - $success2 = 1; - if (@silence_seq != @silence) { - $exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n"; - } else { - foreach (0 .. scalar(@silence)-1) { - if ($psymtab{@silence[$_]} ne @silence_seq[$_]) { - $exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n"; - $success = 0; - } + foreach $i (keys %phones_in_topo_int_hash) { + $p = $pint2sym{$i}; + if ( ! defined $phones_that_should_be_in_topo_hash{$p}) { + $topo_ok = 0; + $exit = 1; print "--> ERROR: $lang/topo covers phone $p (label = $i) which is not a real phone\n"; } } - $success2 != 1 || print "--> $lang/topo's silence section is OK\n"; - $success1 != 1 or $success2 != 1 || print "--> $lang/topo is OK\n"; + if ($topo_ok) { + "--> $lang/topo is OK\n"; + } print "\n"; } @@ -606,7 +605,7 @@ sub check_summation { foreach (keys %sum) { if (!$itset{$_}) { print "$_ "; - } + } } print "\n"; } @@ -625,6 +624,80 @@ sub check_summation { print "\n"; } + + +{ + print "Checking word-level disambiguation symbols...\n"; + # This block checks that one of the two following conditions hold: + # (1) for lang diretories prepared by older versions of prepare_lang.sh: + # The symbol '#0' should appear in words.txt and phones.txt, and should + # or (2): the files wdisambig.txt, wdisambig_phones.int and wdisambig_words.int + # exist, and have the expected properties (see below for details). + + # note, %wdisambig_words_hash hashes from the integer word-id of word-level + # disambiguation symbols, to 1 if the word is a disambig symbol. + my %wdisambig_words_hash; + my %wdisambig_words_string = ""; + + if (! -e "$lang/phones/wdisambig.txt") { + print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)\n"; + if (exists $wsymtab{"#0"}) { + print "--> $lang/words.txt has \"#0\"\n"; + $wdisambig_words_hash{$wsymtab{"#0"}} = 1; + $wdisambig_words_string = $wsymtab{"#0"}; + } else { + print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; + print "--> (if you are using ARPA-type language models, you will normally\n"; + print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; + } + } else { + print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n"; + if (!open(T, "<$lang/phones/wdisambig.txt")) { + print "--> ERROR: fail to open $lang/phones/wdisambig.txt\n"; $exit = 1; return; + } + chomp(my @wdisambig = ); + close(T); + if (!open(W, "<$lang/phones/wdisambig_words.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_words.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_words = ); + close(W); + if (!open(P, "<$lang/phones/wdisambig_phones.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_phones =
<P>
); + close(P); + my $len = @wdisambig, $len2; + if (($len2 = @wdisambig_words) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths"; + $exit = 1; return; + } + if (($len2 = @wdisambig_phones) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths"; + $exit = 1; return; + } + for (my $i = 0; $i < $len; $i++) { + if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch\n"; + $exit = 1; return; + } + } + for (my $i = 0; $i < $len; $i++) { + if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch\n"; + $exit = 1; return; + } + } + foreach my $i ( @wdisambig_words ) { + $wdisambig_words_hash{$i} = 1; + $wdisambig_words_string .= " " . $i; + } + } +} + + if (-s "$lang/phones/word_boundary.int") { print "Checking word_boundary.int and disambig.int\n"; if (!open (W, "<$lang/phones/word_boundary.int")) { @@ -641,7 +714,7 @@ sub check_summation { if (!open (D, "<$lang/phones/disambig.int")) { $exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n"; } - while () { + while () { @A = split; if (@A != 1) { $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/disambig.int\n"; @@ -657,7 +730,9 @@ sub check_summation { $wordseq_syms = ""; foreach (1 .. $wlen) { $id = int(rand(scalar(keys %wint2sym))); - while ($wint2sym{$id} =~ m/^#[0-9]*$/ or + # exclude disambiguation symbols, BOS and EOS and epsilon from the word + # sequence. + while (defined $wdisambig_words_hash{$wint2sym{$id}} or $wint2sym{$id} eq "" or $wint2sym{$id} eq "" or $id == 0) { $id = int(rand(scalar(keys %wint2sym))); } @@ -781,21 +856,17 @@ sub check_summation { } # Check that G.fst does not have cycles with only disambiguation symbols or - # epsilons on the input, or the forbidden symbols and . - $cmd = ". ./path.sh; fstprint $lang/G.fst | awk -v disambig=$lang/phones/disambig.int -v words=$lang/words.txt 'BEGIN{while((getline0) is_disambig[$1]=1; is_disambig[0] = 1; while((getline0){ if(\$1==\"\"||\$1==\"\") is_forbidden[\$2]=1;}} {if(NF<3 || is_disambig[\$3]) print; else if(is_forbidden[\$3] || is_forbidden[\$4]) { print \"Error: line \" \$0 \" in G.fst contains forbidden symbol or \" | \"cat 1>&2\"; exit(1); }}' | fstcompile | fstinfo "; - $output = `$cmd`; - if ($output !~ m/# of states\s+[1-9]/) { # fstinfo did not read a nonempty FST (there should be final probs at least)... - print "--> ERROR: failure running command to check for disambig-sym loops [possibly G.fst " . - "contained the forbidden symbols or , or possibly some other error.. Output was: \n"; - print $output; - $exit = 1; - } - if ($output !~ m/cyclic\s+n/) { # FST was cyclic after selecting only for disambig symbols. This is now allowed. - print "--> ERROR: G.fst contained cycles with only disambiguation symbols or epsilons on the input. Would cause determinization failure in graph creation.\n"; - $exit = 1; - } else { - print "--> G.fst did not contain cycles with only disambig symbols or epsilon on the input, and did not contain\n" . 
- "the forbidden symbols or (if present in vocab) on the input or output.\n"; + # epsilons on the input, or the forbidden symbols and (and a few + # related checks + + if (-e "$lang/G.fst") { + system("utils/lang/check_g_properties.pl $lang"); + if ($? != 0) { + print "--> ERROR: failure running check_g_properties.pl\n"; + $exit = 1; + } else { + print("--> utils/lang/check_g_properties.pl succeeded.\n"); + } } } diff --git a/egs/yesno/s5/input/task.arpabo b/egs/yesno/s5/input/task.arpabo index 415391c98bd..5c6b525b9d7 100644 --- a/egs/yesno/s5/input/task.arpabo +++ b/egs/yesno/s5/input/task.arpabo @@ -1,6 +1,6 @@ \data\ -ngram 1=3 +ngram 1=4 \1-grams: -1 NO diff --git a/egs/yesno/s5/local/prepare_lm.sh b/egs/yesno/s5/local/prepare_lm.sh index de5884d3a86..a5f5431efd3 100755 --- a/egs/yesno/s5/local/prepare_lm.sh +++ b/egs/yesno/s5/local/prepare_lm.sh @@ -1,7 +1,7 @@ #!/bin/bash . path.sh - + echo Preparing language models for test for lm_suffix in tg; do @@ -10,10 +10,10 @@ for lm_suffix in tg; do rm -rf data/lang_test_${lm_suffix} cp -r data/lang data/lang_test_${lm_suffix} - cat input/task.arpabo | arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - #cat input/G.txt | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 --read-symbol-table=$test/words.txt input/task.arpabo $test/G.fst + fstisstochastic $test/G.fst - + # The output is like: # 9.14233e-05 -0.259833 # we do expect the first of these 2 numbers to be close to zero (the second is @@ -30,7 +30,7 @@ for lm_suffix in tg; do < data/local/dict/lexicon.txt >tmpdir.g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst - fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r tmpdir.g done diff --git a/egs/yesno/s5/path.sh b/egs/yesno/s5/path.sh index 708524a5587..21bfd1440fa 100644 --- a/egs/yesno/s5/path.sh +++ b/egs/yesno/s5/path.sh @@ -1,3 +1,8 @@ - -export PATH=$PWD/utils/:$PWD/../../../src/bin:$PWD/../../../tools/openfst/bin:$PWD/../../../src/fstbin/:$PWD/../../../src/gmmbin/:$PWD/../../../src/featbin/:$PWD/../../../src/lm/:$PWD/../../../src/sgmmbin/:$PWD/../../../src/fgmmbin/:$PWD/../../../src/latbin/:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C + + diff --git a/egs/yesno/s5/run.sh b/egs/yesno/s5/run.sh index 3e5d59a9656..12b00273f8b 100755 --- a/egs/yesno/s5/run.sh +++ b/egs/yesno/s5/run.sh @@ -26,6 +26,7 @@ local/prepare_lm.sh for x in train_yesno test_yesno; do steps/make_mfcc.sh --nj 1 data/$x exp/make_mfcc/$x mfcc steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc + utils/fix_data_dir.sh data/$x done # Mono training diff --git a/misc/maintenance/find_missing_dependencies.sh b/misc/maintenance/find_missing_dependencies.sh index 3854dd5ceaa..55e300fe9f0 100755 --- a/misc/maintenance/find_missing_dependencies.sh +++ b/misc/maintenance/find_missing_dependencies.sh @@ -1,5 +1,7 @@ #!/bin/bash +echo "$0: finding missing inter-directory dependencies in src/Makefile" + cd src for x in */Makefile; do @@ -9,4 +11,4 @@ for x in */Makefile; do echo "$dir: $dependency"; fi done -done \ No newline at end of file +done diff --git a/misc/maintenance/fix_apache_headers.sh b/misc/maintenance/fix_apache_headers.sh index 8653bdf6457..7fb813b2624 100755 --- a/misc/maintenance/fix_apache_headers.sh +++ b/misc/maintenance/fix_apache_headers.sh @@ -4,6 +4,11 @@ # authors appears in the apache headers in the source, and that source files # have their Apache headers. Including this mainly for documentation, as I # doubt the issue will occur much in future. +# +# Also makes sure that where the filename appears in a comment at the top of the +# file, e.g. as in +# // somedir/some-file.cc +# the filename is accurate. # run this from the top level of the repo, as # misc/maintenance/fix_apache_headers.sh @@ -11,17 +16,31 @@ set -e cd src rm -rf tmp -for x in */*.{h,cc,dox}; do +for x in */*.{h,cc,dox}; do if [ $x != "util/basic-filebuf.h" ]; then if ! grep 'COPYING for clarification' $x >/dev/null; then echo Fixing $x; if ! grep "Apache License" $x >/dev/null; then echo "$0: warning: file $x may not have an Apache license header" else - cp $x tmp; cat tmp | perl -ape ' if (m/Licensed under the Apache License/) { - print "// See ../../COPYING for clarification regarding multiple authors\n"; + cp $x tmp; cat tmp | perl -ape ' if (m/Licensed under the Apache License/) { + print "// See ../../COPYING for clarification regarding multiple authors\n"; print "//\n";} ' > $x; fi fi fi done + +for x in */*.{h,cc,dox}; do + if [ $x != "util/basic-filebuf.h" ]; then + echo "// $x" | cat - <(tail -n +2 $x) >tmp + if ! diff tmp $x; then + if head -n 1 $x | grep -E '// [-a-z0-9_]+/[-a-z0-9_.]+$'; then + echo "Fixing $x automatically" + cp tmp $x + else + echo "**Please fix $x manually" + fi + fi + fi +done diff --git a/misc/maintenance/fix_include_guards.sh b/misc/maintenance/fix_include_guards.sh index b1338371a78..dde5e6cf155 100755 --- a/misc/maintenance/fix_include_guards.sh +++ b/misc/maintenance/fix_include_guards.sh @@ -8,13 +8,13 @@ set -e cd src rm -rf tmp -for x in */*.h ; do +for x in */*.h ; do name=`echo $x | tr '[a-z]/.-' '[A-Z]___' ` - m=KALDI_${name}_ + m=KALDI_${name}_ n=`grep ifndef $x | awk '{print $2}' | head -n 1` - if [ "$m" != "$n" ]; then - echo "$m != $n"; - if [ ! -z "$n" ]; then + if [ "$m" != "$n" ]; then + echo "$m != $n"; + if [ ! -z "$n" ]; then cp $x tmp; sed s/$n/$m/ $x; else echo "Something wrong for file $x, maybe no include guard." @@ -23,3 +23,12 @@ for x in */*.h ; do done +for x in */*.h ; do + name=`echo $x | tr '[a-z]/.-' '[A-Z]___' ` + m=KALDI_${name}_ + n=`grep endif $x | grep _H_ | sed s://:: | awk '{print $2}' | head -n 1` + if [ ! 
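
The second loop added to fix_apache_headers.sh checks that the "// somedir/some-file.cc" comment on the first line of each source file names the file correctly, and only rewrites it automatically when the existing first line already looks like a path comment. The same test expressed as a minimal C++ sketch (the path used in main() is purely illustrative; the real tool is the shell loop above):

    #include <fstream>
    #include <iostream>
    #include <string>

    // Returns true if the first line of 'path' is exactly "// <path>".
    bool FirstLineNamesFile(const std::string &path) {
      std::ifstream in(path.c_str());
      std::string first_line;
      if (!std::getline(in, first_line)) return false;
      return first_line == "// " + path;
    }

    int main() {
      // Illustrative path, relative to src/ as in the script.
      std::string path = "base/io-funcs.h";
      if (!FirstLineNamesFile(path))
        std::cout << "**Please fix " << path << " manually\n";
      return 0;
    }
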
-s $n ] && [ "$m" != "$n" ]; then + echo "#endif: $m != $n"; + cp $x tmp; sed s/$n/$m/ $x; + fi +done diff --git a/src/Makefile b/src/Makefile index 260879c788b..c10bb518e9d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,11 +8,11 @@ SHELL := /bin/bash SUBDIRS = base matrix util feat tree thread gmm transform sgmm \ fstext hmm lm decoder lat kws cudamatrix nnet \ bin fstbin gmmbin fgmmbin sgmmbin featbin \ - nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 nnet3bin nnet2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin + nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 chain nnet3bin nnet2bin kwsbin \ + ivector ivectorbin online2 online2bin lmbin chainbin MEMTESTDIRS = base matrix util feat tree thread gmm transform sgmm \ - fstext hmm lm decoder lat nnet \ + fstext hmm lm decoder lat nnet kws chain \ bin fstbin gmmbin fgmmbin sgmmbin featbin \ nnetbin latbin sgmm2 nnet2 nnet3 nnet2bin nnet3bin sgmm2bin kwsbin \ ivector ivectorbin online2 online2bin lmbin @@ -111,7 +111,7 @@ ext_test: $(addsuffix /test, $(EXT_SUBDIRS)) # Define an implicit rule, expands to e.g.: # base/test: base -# $(MAKE) -C base test +# $(MAKE) -C base test %/test: % mklibdir $(MAKE) -C $< test @@ -134,7 +134,7 @@ ext_depend: check_portaudio .PHONY: $(SUBDIRS) $(SUBDIRS) : mklibdir - $(MAKE) -C $@ + $(MAKE) -C $@ .PHONY: $(EXT_SUBDIRS) $(EXT_SUBDIRS) : mklibdir @@ -145,37 +145,37 @@ $(EXT_SUBDIRS) : mklibdir # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin latbin ivectorbin lmbin: \ +bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \ base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm \ - lm decoder lat cudamatrix nnet nnet2 nnet3 ivector + lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 #2)The libraries have inter-dependencies base: matrix : base -util: base matrix -thread: util matrix base +thread : base +util: base matrix thread feat: base matrix util gmm transform tree thread -tree: base util matrix +tree: base util thread matrix optimization: base matrix gmm: base util matrix tree thread transform: base util matrix gmm tree thread sgmm: base util matrix gmm tree transform thread hmm sgmm2: base util matrix gmm tree transform thread hmm -fstext: base util matrix tree -hmm: base tree matrix util -lm: base util fstext -decoder: base util matrix gmm sgmm hmm tree transform lat -lat: base util hmm tree matrix -cudamatrix: base util matrix -nnet: base util matrix cudamatrix +fstext: base util thread matrix tree +hmm: base tree matrix util thread +lm: base util thread matrix fstext +decoder: base util thread matrix gmm sgmm hmm tree transform lat +lat: base util thread hmm tree matrix +cudamatrix: base util thread matrix +nnet: base util thread matrix cudamatrix nnet2: base util matrix thread lat gmm hmm tree transform cudamatrix -nnet3: base util matrix thread lat gmm hmm tree transform cudamatrix -ivector: base util matrix thread transform tree gmm +nnet3: base util matrix thread lat gmm hmm tree transform cudamatrix chain fstext +chain: lat hmm tree fstext matrix cudamatrix util thread base +ivector: base util matrix thread transform tree gmm #3)Dependencies for optional parts of Kaldi onlinebin: base matrix util feat tree optimization gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread -online2bin: base matrix util feat tree 
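
fix_include_guards.sh derives the expected guard macro from the header path with tr '[a-z]/.-' '[A-Z]___' (letters uppercased; '/', '.' and '-' become '_') and wraps it as KALDI_<NAME>_; the new second loop applies the same rule to the #endif comment at the bottom of the file. A C++ rendering of that naming rule, for reference:

    #include <cctype>
    #include <iostream>
    #include <string>

    // "base/io-funcs-inl.h" -> "KALDI_BASE_IO_FUNCS_INL_H_"
    std::string ExpectedGuard(const std::string &path) {
      std::string name = path;
      for (size_t i = 0; i < name.size(); i++) {
        char c = name[i];
        if (c == '/' || c == '.' || c == '-')
          name[i] = '_';
        else
          name[i] = std::toupper(static_cast<unsigned char>(c));
      }
      return "KALDI_" + name + "_";
    }

    int main() {
      std::cout << ExpectedGuard("base/io-funcs-inl.h") << "\n";
      // Prints: KALDI_BASE_IO_FUNCS_INL_H_
      return 0;
    }
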
optimization gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online2 thread ivector # python-kaldi-decoding: base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm decoder lat online online: decoder gmm transform feat matrix util base lat hmm thread tree -online2: decoder gmm transform feat matrix util base lat hmm thread ivector cudamatrix nnet2 -kws: base util hmm tree matrix lat -kwsbin: fstext kws lat base util hmm tree matrix +online2: decoder gmm transform feat matrix util base lat hmm thread ivector cudamatrix nnet2 nnet3 chain +kws: base util thread hmm tree matrix lat + diff --git a/src/base/io-funcs-inl.h b/src/base/io-funcs-inl.h index e55458ed43c..6b87f4c1a24 100644 --- a/src/base/io-funcs-inl.h +++ b/src/base/io-funcs-inl.h @@ -1,7 +1,9 @@ // base/io-funcs-inl.h // Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian; Johns Hopkins University (Author: Daniel Povey) +// Jan Silovsky; Yanmin Qian; +// Johns Hopkins University (Author: Daniel Povey) +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -62,7 +64,6 @@ template inline void ReadBasicType(std::istream &is, char len_c = static_cast(len_c_in), len_c_expected = (std::numeric_limits::is_signed ? 1 : -1) * static_cast(sizeof(*t)); - if (len_c != len_c_expected) { KALDI_ERR << "ReadBasicType: did not get expected integer type, " << static_cast(len_c) @@ -87,6 +88,112 @@ template inline void ReadBasicType(std::istream &is, } } +// Template that covers integers. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v) { + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + char sz = sizeof(T); // this is currently just a check. + os.write(&sz, 1); + int32 vecsz = static_cast(v.size()); + KALDI_ASSERT((size_t)vecsz == v.size()); + os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (vecsz != 0) { + os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); + } + } else { + // focus here is on prettiness of text form rather than + // efficiency of reading-in. + // reading-in is dominated by low-level operations anyway: + // for efficiency use binary. + os << "[ "; + typename std::vector >::const_iterator iter = v.begin(), + end = v.end(); + for (; iter != end; ++iter) { + if (sizeof(T) == 1) + os << static_cast(iter->first) << ',' + << static_cast(iter->second) << ' '; + else + os << iter->first << ',' + << iter->second << ' '; + } + os << "]\n"; + } + if (os.fail()) { + throw std::runtime_error("Write failure in WriteIntegerPairVector."); + } +} + +// Template that covers integers. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v) { + KALDI_ASSERT_IS_INTEGER_TYPE(T); + KALDI_ASSERT(v != NULL); + if (binary) { + int sz = is.peek(); + if (sz == sizeof(T)) { + is.get(); + } else { // this is currently just a check. + KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " + << sizeof(T) << ", saw instead " << sz << ", at file position " + << is.tellg(); + } + int32 vecsz; + is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (is.fail() || vecsz < 0) goto bad; + v->resize(vecsz); + if (vecsz > 0) { + is.read(reinterpret_cast(&((*v)[0])), sizeof(T)*vecsz*2); + } + } else { + std::vector > tmp_v; // use temporary so v doesn't use extra memory + // due to resizing. 
+ is >> std::ws; + if (is.peek() != static_cast('[')) { + KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " + << is.peek() << ", at file position " << is.tellg(); + } + is.get(); // consume the '['. + is >> std::ws; // consume whitespace. + while (is.peek() != static_cast(']')) { + if (sizeof(T) == 1) { // read/write chars as numbers. + int16 next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } else { + T next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::pair(next_t1, next_t2)); + } + } + is.get(); // get the final ']'. + *v = tmp_v; // could use std::swap to use less temporary memory, but this + // uses less permanent memory. + } + if (!is.fail()) return; + bad: + KALDI_ERR << "ReadIntegerPairVector: read failure at file position " + << is.tellg(); +} template inline void WriteIntegerVector(std::ostream &os, bool binary, const std::vector &v) { @@ -117,7 +224,7 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, os << "]\n"; } if (os.fail()) { - throw std::runtime_error("Write failure in WriteIntegerType."); + throw std::runtime_error("Write failure in WriteIntegerVector."); } } @@ -178,6 +285,7 @@ template inline void ReadIntegerVector(std::istream &is, << is.tellg(); } + // Initialize an opened stream for writing by writing an optional binary // header and modifying the floating-point precision. 
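
The new WriteIntegerPairVector/ReadIntegerPairVector templates store a vector of integer pairs either in binary form (a size byte, a 32-bit count, then the raw pairs) or in a text form such as "[ 1,2 3,4 ]". The standalone sketch below mimics only the text layout so the format is easy to see; it is not the Kaldi implementation, which also covers the char-sized and binary cases and Kaldi's own error handling:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    // Write pairs in the same text layout as WriteIntegerPairVector: "[ a,b c,d ]".
    void WritePairsText(std::ostream &os,
                        const std::vector<std::pair<int, int> > &v) {
      os << "[ ";
      for (size_t i = 0; i < v.size(); i++)
        os << v[i].first << ',' << v[i].second << ' ';
      os << "]\n";
    }

    // Read the same layout back; returns false on any format problem.
    bool ReadPairsText(std::istream &is, std::vector<std::pair<int, int> > *v) {
      std::string tok;
      if (!(is >> tok) || tok != "[") return false;
      v->clear();
      while (is >> tok && tok != "]") {
        int a, b;
        char comma;
        std::istringstream pair_ss(tok);
        if (!(pair_ss >> a >> comma >> b) || comma != ',') return false;
        v->push_back(std::make_pair(a, b));
      }
      return tok == "]";
    }

    int main() {
      std::vector<std::pair<int, int> > v, v2;
      v.push_back(std::make_pair(3, 7));
      v.push_back(std::make_pair(-1, 42));
      std::stringstream ss;
      WritePairsText(ss, v);
      ReadPairsText(ss, &v2);
      std::cout << ss.str() << (v == v2 ? "round-trip ok\n" : "mismatch\n");
      return 0;
    }
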
inline void InitKaldiOutputStream(std::ostream &os, bool binary) { diff --git a/src/base/io-funcs-test.cc b/src/base/io-funcs-test.cc index 36a9e1e5f3f..dd05326d5ed 100644 --- a/src/base/io-funcs-test.cc +++ b/src/base/io-funcs-test.cc @@ -43,8 +43,20 @@ void UnitTestIo(bool binary) { WriteIntegerVector(outfile, binary, vec2); if (!binary) outfile << " \n"; std::vector vec3; - for (size_t i = 0; i < 10; i++) vec3.push_back(Rand()%100); + + int32 size = RandInt(0, 10); + for (size_t i = 0; i < size; i++) vec3.push_back(Rand()%100); WriteIntegerVector(outfile, binary, vec3); + std::vector > vec4; + WriteIntegerPairVector(outfile, binary, vec4); + if (!binary && Rand()%2 == 0) outfile << " \n"; + std::vector > vec5; + for (size_t i = 0; i < size; i++) vec5.push_back(std::make_pair(Rand()%100 - 10, Rand()%100 - 10)); + WriteIntegerPairVector(outfile, binary, vec5); + if (!binary) outfile << " \n"; + std::vector > vec6; + for (size_t i = 0; i < size; i++) vec6.push_back(std::make_pair(Rand()%100, Rand()%100)); + WriteIntegerPairVector(outfile, binary, vec6); if (!binary && Rand()%2 == 0) outfile << " \n"; const char *token1 = "Hi"; WriteToken(outfile, binary, token1); @@ -90,9 +102,19 @@ void UnitTestIo(bool binary) { std::vector vec3_in; ReadIntegerVector(infile, binary_in, &vec3_in); KALDI_ASSERT(vec3_in == vec3); + std::vector > vec4_in; + ReadIntegerPairVector(infile, binary_in, &vec4_in); + KALDI_ASSERT(vec4_in == vec4); + std::vector > vec5_in; + ReadIntegerPairVector(infile, binary_in, &vec5_in); + KALDI_ASSERT(vec5_in == vec5); + std::vector > vec6_in; + ReadIntegerPairVector(infile, binary_in, &vec6_in); + KALDI_ASSERT(vec6_in == vec6); std::string token1_in, token2_in; KALDI_ASSERT(Peek(infile, binary_in) == static_cast(*token1)); - KALDI_ASSERT(PeekToken(infile, binary_in) == (int)*token1); // Note: + KALDI_ASSERT(PeekToken(infile, binary_in) == static_cast(*token1)); + // Note: // the stuff with skipping over '<' is tested in ../util/kaldi-io-test.cc, // since we need to make sure it works with pipes. ReadToken(infile, binary_in, &token1_in); @@ -132,7 +154,7 @@ int main() { UnitTestIo(false); UnitTestIo(true); } - KALDI_ASSERT(1); // just wanted to check that KALDI_ASSERT does not fail for 1. + KALDI_ASSERT(1); // just to check that KALDI_ASSERT does not fail for 1. return 0; } diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index 2bc9da895d4..4caddc6b5b3 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Saarland University; // Jan Silovsky; Yanmin Qian +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -98,7 +99,6 @@ namespace kaldi { void ReadToken(std::istream &is, bool binary, std::string *str); void PeekToken(std::istream &is, bool binary, std::string *str); - WriteToken writes the token and one space (whether in binary or text mode). Peek returns the first character of the next token, by consuming whitespace @@ -182,6 +182,16 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, template inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); +/// Function for writing STL vectors of pairs of integer types. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v); + +/// Function for reading STL vector of pairs of integer types. 
+template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v); + /// The WriteToken functions are for writing nonempty sequences of non-space /// characters. They are not for general strings. void WriteToken(std::ostream &os, bool binary, const char *token); diff --git a/src/base/kaldi-common.h b/src/base/kaldi-common.h index 33f6f314db4..e0002d91bb7 100644 --- a/src/base/kaldi-common.h +++ b/src/base/kaldi-common.h @@ -28,8 +28,8 @@ #include #include #include -#include -#include +#include +#include #include "base/kaldi-utils.h" #include "base/kaldi-error.h" @@ -38,4 +38,3 @@ #include "base/kaldi-math.h" #endif // KALDI_BASE_KALDI_COMMON_H_ - diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 20301e2702f..527de852cac 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -46,7 +46,8 @@ int main() { try { kaldi::UnitTestError(); KALDI_ASSERT(0); // should not happen. - } catch (std::runtime_error &r) { + exit(1); + } catch(std::runtime_error &r) { std::cout << "UnitTestError: the error we generated was: " << r.what(); } } diff --git a/src/base/kaldi-error.cc b/src/base/kaldi-error.cc index 96349e17742..5ca884e996f 100644 --- a/src/base/kaldi-error.cc +++ b/src/base/kaldi-error.cc @@ -1,5 +1,6 @@ // base/kaldi-error.cc +// Copyright 2016 Brno University of Technology (author: Karel Vesely) // Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek // See ../../COPYING for clarification regarding multiple authors @@ -20,8 +21,8 @@ #ifdef HAVE_EXECINFO_H #include // To get stack trace in error messages. // If this #include fails there is an error in the Makefile, it does not -// support your platform well. Make sure HAVE_EXECINFO_H is undefined, and the -// code will compile. +// support your platform well. Make sure HAVE_EXECINFO_H is undefined, +// and the code will compile. #ifdef HAVE_CXXABI_H #include // For name demangling. // Useful to decode the stack trace, but only used if we have execinfo.h @@ -32,24 +33,31 @@ #include "base/kaldi-error.h" namespace kaldi { -int32 g_kaldi_verbose_level = 0; // Just initialize this global variable. + +/***** GLOBAL VARIABLES FOR LOGGING *****/ + +int32 g_kaldi_verbose_level = 0; const char *g_program_name = NULL; +static LogHandler g_log_handler = NULL; // If the program name was set (g_program_name != ""), the function // GetProgramName returns the program name (without the path) followed by a // colon, e.g. "gmm-align:". Otherwise it returns the empty string "". const char *GetProgramName() { - if (g_program_name == NULL) return ""; - else return g_program_name; + return g_program_name == NULL ? "" : g_program_name; } + +/***** HELPER FUNCTIONS *****/ + // Given a filename like "/a/b/c/d/e/f.cc", GetShortFileName // returns "e/f.cc". Does not currently work if backslash is // the filename separator. 
-const char *GetShortFileName(const char *filename) { +static const char *GetShortFileName(const char *filename) { const char *last_slash = strrchr(filename, '/'); - if (!last_slash) { return filename; } - else { + if (!last_slash) { + return filename; + } else { while (last_slash > filename && last_slash[-1] != '/') last_slash--; return last_slash; @@ -57,133 +65,180 @@ const char *GetShortFileName(const char *filename) { } +/***** STACKTRACE *****/ + +static std::string Demangle(std::string trace_name) { #if defined(HAVE_CXXABI_H) && defined(HAVE_EXECINFO_H) -// The function name looks like a macro: it's a macro if we don't have ccxxabi.h -inline void KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(std::string &ans, - const char *to_append) { - // at input the string "to_append" looks like: + // at input the string looks like: // ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d] // We want to extract the name e.g. '_ZN5kaldi13UnitTestErrorEv", // demangle it and return it. - int32 status; - const char *paren = strchr(to_append, '('); - const char *plus = (paren ? strchr(paren, '+') : NULL); - if (!plus) { // did not find the '(' or did not find the '+' - // This is a soft failure in case we did not get what we expected. - ans += to_append; - return; + + // try to locate '(' and '+', take the string in between, + size_t begin(trace_name.find("(")), + end(trace_name.rfind("+")); + if (begin != std::string::npos && end != std::string::npos && begin < end) { + trace_name = trace_name.substr(begin+1,end-(begin+1)); } - std::string stripped(paren+1, plus-(paren+1)); // the bit between ( and +. - - char *demangled_name = abi::__cxa_demangle(stripped.c_str(), 0, 0, &status); - - // if status != 0 it is an error (demangling failure), but not all names seem - // to demangle, so we don't check it. - - if (demangled_name != NULL) { - ans += demangled_name; + // demangle, + int status; + char *demangled_name = abi::__cxa_demangle(trace_name.c_str(), 0, 0, &status); + std::string ans; + if (status == 0) { + ans = demangled_name; free(demangled_name); } else { - ans += to_append; // add the original string. + ans = trace_name; } + // return, + return ans; +#else + return trace_name; +#endif } -#else // defined(HAVE_CXXABI_H) && defined(HAVE_EXECINFO_H) -#define KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(ans, to_append) ans += to_append -#endif // defined(HAVE_CXXABI_H) && defined(HAVE_EXECINFO_H) + +static std::string KaldiGetStackTrace() { + std::string ans; #ifdef HAVE_EXECINFO_H -std::string KaldiGetStackTrace() { #define KALDI_MAX_TRACE_SIZE 50 #define KALDI_MAX_TRACE_PRINT 20 // must be even. - std::string ans; - void *array[KALDI_MAX_TRACE_SIZE]; - size_t size = backtrace(array, KALDI_MAX_TRACE_SIZE); - char **strings = backtrace_symbols(array, size); + // buffer for the trace, + void *trace[KALDI_MAX_TRACE_SIZE]; + // get the trace, + size_t size = backtrace(trace, KALDI_MAX_TRACE_SIZE); + // get the trace symbols, + char **trace_symbol = backtrace_symbols(trace, size); + + // Compose the 'string', + ans += "[ Stack-Trace: ]\n"; if (size <= KALDI_MAX_TRACE_PRINT) { for (size_t i = 0; i < size; i++) { - KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(ans, strings[i]); - ans += "\n"; + ans += Demangle(trace_symbol[i]) + "\n"; } } else { // print out first+last (e.g.) 5. 
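
The rewritten Demangle() takes one line from backtrace_symbols(), cuts out the mangled name between '(' and '+', passes it to abi::__cxa_demangle(), and falls back to the original string on any failure. A standalone sketch of that flow is below; it assumes Linux with glibc and libstdc++, and needs -rdynamic at link time for symbol names to appear in the trace:

    #include <cxxabi.h>
    #include <execinfo.h>
    #include <cstdlib>
    #include <iostream>
    #include <string>

    static std::string DemangleFrame(std::string frame) {
      // frame looks like: ./prog(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
      size_t begin = frame.find('('), end = frame.rfind('+');
      if (begin == std::string::npos || end == std::string::npos || begin >= end)
        return frame;  // not the expected layout; keep it as-is.
      std::string mangled = frame.substr(begin + 1, end - (begin + 1));
      int status = 0;
      char *demangled = abi::__cxa_demangle(mangled.c_str(), 0, 0, &status);
      if (status != 0 || demangled == NULL) return frame;
      std::string result = demangled;
      free(demangled);
      return result;
    }

    int main() {
      void *trace[20];
      int size = backtrace(trace, 20);
      char **symbols = backtrace_symbols(trace, size);
      if (symbols == NULL) return 1;
      for (int i = 0; i < size; i++)
        std::cout << DemangleFrame(symbols[i]) << "\n";
      free(symbols);  // one malloc'ed block holds the pointers and the strings.
      return 0;
    }
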
for (size_t i = 0; i < KALDI_MAX_TRACE_PRINT/2; i++) { - KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(ans, strings[i]); - ans += "\n"; + ans += Demangle(trace_symbol[i]) + "\n"; } ans += ".\n.\n.\n"; for (size_t i = size - KALDI_MAX_TRACE_PRINT/2; i < size; i++) { - KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(ans, strings[i]); - ans += "\n"; + ans += Demangle(trace_symbol[i]) + "\n"; } if (size == KALDI_MAX_TRACE_SIZE) ans += ".\n.\n.\n"; // stack was too long, probably a bug. } - free(strings); // it's all in one big malloc()ed block. - -#ifdef HAVE_CXXABI_H // demangle the name, if possible. -#endif // HAVE_CXXABI_H + // cleanup, + free(trace_symbol); // it's okay, just the pointers, not the strings. +#endif // HAVE_EXECINFO_H return ans; } -#endif -void KaldiAssertFailure_(const char *func, const char *file, - int32 line, const char *cond_str) { - std::ostringstream ss; - ss << "KALDI_ASSERT: at " << GetProgramName() << func << ':' - << GetShortFileName(file) - << ':' << line << ", failed: " << cond_str << '\n'; -#ifdef HAVE_EXECINFO_H - ss << "Stack trace is:\n" << KaldiGetStackTrace(); -#endif - std::cerr << ss.str(); - std::cerr.flush(); - // We used to call abort() here, but switch to throwing an exception - // (like KALDI_ERR) because it's easier to deal with in multi-threaded - // code. - throw std::runtime_error(ss.str()); -} +/***** KALDI LOGIGNG *****/ -KaldiWarnMessage::KaldiWarnMessage(const char *func, const char *file, - int32 line) { - this->stream() << "WARNING (" << GetProgramName() << func << "():" - << GetShortFileName(file) << ':' << line << ") "; +MessageLogger::MessageLogger(LogMessageEnvelope::Severity severity, + const char *func, const char *file, int32 line) { + // Obviously, we assume the strings survive the destruction of this object. + envelope_.severity = severity; + envelope_.func = func; + envelope_.file = GetShortFileName(file); // Pointer inside 'file'. + envelope_.line = line; } -KaldiLogMessage::KaldiLogMessage(const char *func, const char *file, - int32 line) { - this->stream() << "LOG (" << GetProgramName() << func << "():" - << GetShortFileName(file) << ':' << line << ") "; +MessageLogger::~MessageLogger() KALDI_NOEXCEPT(false) { + // remove trailing '\n', + std::string str = ss_.str(); + while (!str.empty() && str[str.length() - 1] == '\n') + str.resize(str.length() - 1); + + // print the mesage (or send to logging handler), + MessageLogger::HandleMessage(envelope_, str.c_str()); } -KaldiVlogMessage::KaldiVlogMessage(const char *func, const char *file, - int32 line, int32 verbose) { - this->stream() << "VLOG[" << verbose << "] (" << GetProgramName() << func - << "():" << GetShortFileName(file) << ':' << line << ") "; +void MessageLogger::HandleMessage(const LogMessageEnvelope &envelope, + const char *message) { + // Send to a logging handler if provided. + if (g_log_handler != NULL) { + g_log_handler(envelope, message); + } else { + // Otherwise, we use the default Kaldi logging. 
+ // Build the log-message 'header', + std::stringstream header; + if (envelope.severity > LogMessageEnvelope::kInfo) { + header << "VLOG[" << envelope.severity << "] ("; + } else { + switch (envelope.severity) { + case LogMessageEnvelope::kInfo : + header << "LOG ("; + break; + case LogMessageEnvelope::kWarning : + header << "WARNING ("; + break; + case LogMessageEnvelope::kError : + header << "ERROR ("; + break; + case LogMessageEnvelope::kAssertFailed : + header << "ASSERTION_FAILED ("; + break; + default: + abort(); // coding errror (unknown 'severity'), + } + } + // fill the other info from the envelope, + header << GetProgramName() << envelope.func << "():" + << envelope.file << ':' << envelope.line << ")"; + + // Printing the message, + if (envelope.severity >= LogMessageEnvelope::kWarning) { + // VLOG, LOG, WARNING: + fprintf(stderr, "%s %s\n", header.str().c_str(), message); + } else { + // ERROR, ASSERT_FAILED (print with stack-trace): + fprintf(stderr, "%s %s\n\n%s\n", header.str().c_str(), message, + KaldiGetStackTrace().c_str()); + } + } + + // Should we throw exception, or abort? + switch (envelope.severity) { + case LogMessageEnvelope::kAssertFailed: + abort(); // ASSERT_FAILED, + break; + case LogMessageEnvelope::kError: + if (!std::uncaught_exception()) { + // throw exception with empty message, + throw std::runtime_error(""); // KALDI_ERR, + } else { + // If we got here, this thread has already thrown exception, + // and this exception has not yet arrived to its 'catch' clause... + // Throwing a new exception would be unsafe! + // (can happen during 'stack unwinding', if we have 'KALDI_ERR << msg' + // in a destructor of some local object). + abort(); + } + break; + } } -KaldiErrorMessage::KaldiErrorMessage(const char *func, const char *file, - int32 line) { - this->stream() << "ERROR (" << GetProgramName() << func << "():" - << GetShortFileName(file) << ':' << line << ") "; + +/***** KALDI ASSERTS *****/ + +void KaldiAssertFailure_(const char *func, const char *file, + int32 line, const char *cond_str) { + MessageLogger ml(LogMessageEnvelope::kAssertFailed, func, file, line); + ml.stream() << ": '" << cond_str << "' "; } -KaldiErrorMessage::~KaldiErrorMessage() KALDI_NOEXCEPT(false) { - // (1) Print the message to stderr. - std::cerr << ss.str() << '\n'; - // (2) Throw an exception with the message, plus traceback info if available. - if (!std::uncaught_exception()) { -#ifdef HAVE_EXECINFO_H - throw std::runtime_error(ss.str() + "\n\n[stack trace: ]\n" + - KaldiGetStackTrace() + "\n"); -#else - throw std::runtime_error(ss.str()); -#endif - } else { - abort(); // This may be temporary... - } + +/***** THIRD-PARTY LOG-HANDLER *****/ + +LogHandler SetLogHandler(LogHandler new_handler) { + LogHandler old_handler = g_log_handler; + g_log_handler = new_handler; + return old_handler; } } // end namespace kaldi diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index 6de7eeea775..2911036d1b7 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -1,5 +1,6 @@ // base/kaldi-error.h +// Copyright 2016 Brno University of Technology (author: Karel Vesely) // Copyright 2009-2011 Microsoft Corporation; Ondrej Glembek; Lukas Burget; // Saarland University @@ -21,33 +22,40 @@ #ifndef KALDI_BASE_KALDI_ERROR_H_ #define KALDI_BASE_KALDI_ERROR_H_ 1 -#include -#include +#include #include #include -#include +#include +#include +#include "base/kaldi-types.h" +#include "base/kaldi-utils.h" +/* Important that this file does not depend on any other kaldi headers. 
*/ + +// By adding 'KALDI_NOEXCEPT(bool)' immediately after function declaration, +// we can tell the compiler that the function must-not produce +// exceptions (true), or may produce exceptions (false): #if _MSC_VER >= 1900 || (!defined(_MSC_VER) && __cplusplus >= 201103L) #define KALDI_NOEXCEPT(Predicate) noexcept((Predicate)) #elif defined(__GXX_EXPERIMENTAL_CXX0X__) && \ - (__GNUC__ >= 4 && __GNUC_MINOR__ >= 6) + (__GNUC__ >= 4 && __GNUC_MINOR__ >= 6) #define KALDI_NOEXCEPT(Predicate) noexcept((Predicate)) #else #define KALDI_NOEXCEPT(Predicate) #endif -#include "base/kaldi-types.h" -#include "base/kaldi-utils.h" - -/* Important that this file does not depend on any other kaldi headers. */ - +#ifdef _MSC_VER +#define __func__ __FUNCTION__ +#endif namespace kaldi { /// \addtogroup error_group /// @{ -/// This is set by util/parse-options.{h, cc} if you set --verbose = ? option +/***** VERBOSITY LEVEL *****/ + +/// This is set by util/parse-options.{h, cc} if you set --verbose=? option. extern int32 g_kaldi_verbose_level; /// This is set by util/parse-options.{h, cc} (from argv[0]) and used (if set) @@ -63,64 +71,82 @@ inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } /// automatically from ParseOptions. inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } -// Class KaldiLogMessage is invoked from the KALDI_WARN, KALDI_VLOG and -// KALDI_LOG macros. It prints the message to stderr. Note: we avoid -// using cerr, due to problems with thread safety. fprintf is guaranteed -// thread-safe. - -// class KaldiWarnMessage is invoked from the KALDI_WARN macro. -class KaldiWarnMessage { - public: - inline std::ostream &stream() { return ss; } - KaldiWarnMessage(const char *func, const char *file, int32 line); - ~KaldiWarnMessage() { fprintf(stderr, "%s\n", ss.str().c_str()); } - private: - std::ostringstream ss; -}; -// class KaldiLogMessage is invoked from the KALDI_LOG macro. -class KaldiLogMessage { - public: - inline std::ostream &stream() { return ss; } - KaldiLogMessage(const char *func, const char *file, int32 line); - ~KaldiLogMessage() { fprintf(stderr, "%s\n", ss.str().c_str()); } - private: - std::ostringstream ss; +/***** KALDI LOGGING *****/ + +/// Log message severity and source location info. +struct LogMessageEnvelope { + enum Severity { + kAssertFailed = -3, + kError = -2, + kWarning = -1, + kInfo = 0, + }; + // An 'enum Severity' value, or a positive number indicating verbosity level. + int severity; + const char *func; + const char *file; + int32 line; }; -// Class KaldiVlogMessage is invoked from the KALDI_VLOG macro. -class KaldiVlogMessage { - public: - KaldiVlogMessage(const char *func, const char *file, int32 line, - int32 verbose_level); - inline std::ostream &stream() { return ss; } - ~KaldiVlogMessage() { fprintf(stderr, "%s\n", ss.str().c_str()); } - private: - std::ostringstream ss; +// Class MessageLogger is invoked from the KALDI_ASSERT, KALDI_ERR, KALDI_WARN and +// KALDI_LOG macros. It formats the message, then either prints it to stderr or +// passes to the log custom handler if provided, then, in case of the error, +// throws an std::runtime_exception, in case of failed KALDI_ASSERT calls abort(). +// +// Note: we avoid using std::cerr for thread safety issues. +// fprintf(stderr,...) is guaranteed thread-safe, and outputs +// its formatted string atomically. 
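
The LogMessageEnvelope above is what a third-party log handler receives: the severity (or positive verbosity level) plus the function, file and line the message came from. Together with SetLogHandler() from the kaldi-error.cc hunk, an application can redirect all Kaldi messages. A minimal usage sketch against the patched header (the handler body and the prefix it prints are only illustrative):

    #include <cstdio>
    #include "base/kaldi-error.h"

    // Send every Kaldi message to an application-specific sink; here just
    // stderr with a custom prefix.
    static void MyLogHandler(const kaldi::LogMessageEnvelope &envelope,
                             const char *message) {
      const char *tag =
          envelope.severity >= kaldi::LogMessageEnvelope::kInfo ? "info" :
          envelope.severity == kaldi::LogMessageEnvelope::kWarning ? "warn" :
          "error";
      std::fprintf(stderr, "[myapp:%s] %s:%d %s\n", tag, envelope.file,
                   static_cast<int>(envelope.line), message);
    }

    int main() {
      kaldi::LogHandler old_handler = kaldi::SetLogHandler(MyLogHandler);
      KALDI_LOG << "this goes through MyLogHandler";
      KALDI_WARN << "so does this";
      kaldi::SetLogHandler(old_handler);  // NULL restores default stderr logging.
      return 0;
    }
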
+class MessageLogger { +public: + /// Constructor stores the info, + MessageLogger(LogMessageEnvelope::Severity severity, + const char *func, + const char *file, + int32 line); + + /// Destructor, calls 'HandleMessage' which prints the message, + /// (since C++11 a 'throwing' destructor must be declared 'noexcept(false)') + ~MessageLogger() KALDI_NOEXCEPT(false); + + /// The hook for the 'insertion operator', e.g. + /// 'KALDI_LOG << "Message,"', + inline std::ostream &stream() { return ss_; } + +private: + /// The logging function, + static void HandleMessage(const LogMessageEnvelope &env, const char *msg); + +private: + LogMessageEnvelope envelope_; + std::ostringstream ss_; }; - -// class KaldiErrorMessage is invoked from the KALDI_ERROR macro. -// The destructor throws an exception. -class KaldiErrorMessage { - public: - KaldiErrorMessage(const char *func, const char *file, int32 line); - inline std::ostream &stream() { return ss; } - ~KaldiErrorMessage() KALDI_NOEXCEPT(false); // defined in kaldi-error.cc - private: - std::ostringstream ss; -}; +// The definition of the logging macros, +#define KALDI_ERR \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kError, \ + __func__, __FILE__, __LINE__).stream() +#define KALDI_WARN \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kWarning, \ + __func__, __FILE__, __LINE__).stream() +#define KALDI_LOG \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kInfo, \ + __func__, __FILE__, __LINE__).stream() +#define KALDI_VLOG(v) if ((v) <= ::kaldi::g_kaldi_verbose_level) \ + ::kaldi::MessageLogger((::kaldi::LogMessageEnvelope::Severity)(v), \ + __func__, __FILE__, __LINE__).stream() +/***** KALDI ASSERTS *****/ -#ifdef _MSC_VER -#define __func__ __FUNCTION__ -#endif +void KaldiAssertFailure_(const char *func, const char *file, + int32 line, const char *cond_str); // Note on KALDI_ASSERT and KALDI_PARANOID_ASSERT // The original (simple) version of the code was this // -// #define KALDI_ASSERT(cond) if (!(cond)) kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); +// #define KALDI_ASSERT(cond) if (!(cond)) +// kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); // // That worked well, but we were concerned that it // could potentially cause a performance issue due to failed branch @@ -139,35 +165,34 @@ class KaldiErrorMessage { // and compilers will be able to optimize the loop away (as the condition // is always false). #ifndef NDEBUG -#define KALDI_ASSERT(cond) \ - do { if ((cond)) ; else kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond);} while(0) +#define KALDI_ASSERT(cond) do { if (cond) (void)0; else \ + ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); } while(0) #else -#define KALDI_ASSERT(cond) +#define KALDI_ASSERT(cond) (void)0 #endif -// also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, +// Also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, // and KALDI_ASSERT_IS_INTEGER_TYPE and KALDI_ASSERT_IS_FLOATING_TYPE, // also defined there. 
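
With the macros above, KALDI_LOG and KALDI_WARN print immediately, KALDI_VLOG(v) only fires when v <= the global verbosity level, and KALDI_ERR prints the message plus a stack trace and then throws std::runtime_error (now with an empty what(), since the text has already gone to stderr). A short usage sketch:

    #include <stdexcept>
    #include "base/kaldi-error.h"

    int main() {
      kaldi::SetVerboseLevel(2);
      KALDI_LOG << "starting up";
      KALDI_VLOG(2) << "printed, because 2 <= verbose level";
      KALDI_VLOG(3) << "suppressed, because 3 > verbose level";
      KALDI_WARN << "something looks off";
      try {
        KALDI_ERR << "fatal condition";   // prints, then throws.
      } catch (const std::runtime_error &e) {
        // Recoverable here; the message itself already went to stderr.
      }
      return 0;
    }
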
-#ifdef KALDI_PARANOID // some more expensive asserts only checked if this defined -#define KALDI_PARANOID_ASSERT(cond) \ - do { if ((cond)) ; else kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond);} while(0) +// some more expensive asserts only checked if this defined +#ifdef KALDI_PARANOID +#define KALDI_PARANOID_ASSERT(cond) do { if (cond) (void)0; else \ + ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); } while(0) #else -#define KALDI_PARANOID_ASSERT(cond) +#define KALDI_PARANOID_ASSERT(cond) (void)0 #endif -#define KALDI_ERR kaldi::KaldiErrorMessage(__func__, __FILE__, __LINE__).stream() -#define KALDI_WARN kaldi::KaldiWarnMessage(__func__, __FILE__, __LINE__).stream() -#define KALDI_LOG kaldi::KaldiLogMessage(__func__, __FILE__, __LINE__).stream() +/***** THIRD-PARTY LOG-HANDLER *****/ -#define KALDI_VLOG(v) if (v <= kaldi::g_kaldi_verbose_level) \ - kaldi::KaldiVlogMessage(__func__, __FILE__, __LINE__, v).stream() +/// Type of third-party logging function, +typedef void (*LogHandler)(const LogMessageEnvelope &envelope, + const char *message); -inline bool IsKaldiError(const std::string &str) { - return(!strncmp(str.c_str(), "ERROR ", 6)); -} - -void KaldiAssertFailure_(const char *func, const char *file, - int32 line, const char *cond_str); +/// Set logging handler. If called with a non-NULL function pointer, the +/// function pointed by it is called to send messages to a caller-provided +/// log. If called with NULL pointer, restores default Kaldi error logging to +/// stderr. SetLogHandler is obviously not thread safe. +LogHandler SetLogHandler(LogHandler); /// @} end "addtogroup error_group" diff --git a/src/base/kaldi-math-test.cc b/src/base/kaldi-math-test.cc index 3026f05502f..52719cc4669 100644 --- a/src/base/kaldi-math-test.cc +++ b/src/base/kaldi-math-test.cc @@ -1,5 +1,5 @@ // base/kaldi-math-test.cc -// +// // Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; Jan Silovsky // See ../../COPYING for clarification regarding multiple authors @@ -17,8 +17,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "base/kaldi-math.h" -#include "base/timer.h" #include +#include "base/timer.h" namespace kaldi { @@ -37,7 +37,7 @@ template void UnitTestGcdLcmTpl() { KALDI_ASSERT((c*a) % g == 0); // test least common multiple - if (b <= 0 || c <= 0) continue; // lcm not defined unless both positive. + if (b <= 0 || c <= 0) continue; // lcm not defined unless both positive. I h = Lcm(b*a, c*a); KALDI_ASSERT(h != 0 && (h % (b*a)) == 0 && (h % (c*a)) == 0); @@ -54,18 +54,17 @@ void UnitTestRoundUpToNearestPowerOfTwo() { KALDI_ASSERT(RoundUpToNearestPowerOfTwo(255) == 256); KALDI_ASSERT(RoundUpToNearestPowerOfTwo(256) == 256); KALDI_ASSERT(RoundUpToNearestPowerOfTwo(257) == 512); - KALDI_ASSERT(RoundUpToNearestPowerOfTwo(1073700000) == 1073741824 ); + KALDI_ASSERT(RoundUpToNearestPowerOfTwo(1073700000) == 1073741824); } void UnitTestGcdLcm() { UnitTestGcdLcmTpl(); UnitTestGcdLcmTpl(); - UnitTestGcdLcmTpl(); + UnitTestGcdLcmTpl(); } void UnitTestRand() { // Testing random-number generation. - using namespace kaldi; std::cout << "Testing random-number generation. " << "If there is an error this may not terminate.\n"; std::cout << "If this does not terminate, look more closely. 
" @@ -77,14 +76,14 @@ void UnitTestRand() { float sum = RandUniform()-0.5; for (int j = 0; ; j++) { sum += RandUniform()-0.5; - if (std::abs(sum) < 0.5*sqrt((double)j)) break; + if (std::abs(sum) < 0.5*sqrt(static_cast(j))) break; } } { // test RandGauss. float sum = RandGauss(); for (int j = 0; ; j++) { sum += RandGauss(); - if (std::abs(sum) < 0.5*sqrt((double)j)) break; + if (std::abs(sum) < 0.5*sqrt(static_cast(j))) break; } } { // test RandGauss. @@ -93,8 +92,9 @@ void UnitTestRand() { float a, b; RandGauss2(&a, &b); if (i % 2 == 0) sum += a; - else sum += b; - if (std::abs(sum) < 0.5*sqrt((double)j)) break; + else + sum += b; + if (std::abs(sum) < 0.5*sqrt(static_cast(j))) break; } } { // test poisson_Rand(). @@ -105,7 +105,7 @@ void UnitTestRand() { double sum = RandPoisson(lambda) - lambda; // expected value is zero. for (int j = 0; ; j++) { sum += RandPoisson(lambda) - lambda; - if (std::abs(sum) < 0.5*sqrt((double)j)) break; + if (std::abs(sum) < 0.5*sqrt(static_cast(j))) break; } } @@ -138,7 +138,8 @@ void UnitTestRand() { float sum = RandInt(minint, maxint) + 0.5*(minint+maxint); for (int j = 0; ; j++) { sum += RandInt(minint, maxint) - 0.5*(minint+maxint); - if (std::abs((float)sum) < 0.5*sqrt((double)j)*(maxint-minint)) break; + if (std::abs(static_cast(sum)) < + 0.5*sqrt(static_cast(j))*(maxint-minint)) break; } } { // test RandPrune in basic way. @@ -157,7 +158,6 @@ void UnitTestRand() { } void UnitTestLogAddSub() { - using namespace kaldi; for (int i = 0; i < 100; i++) { double f1 = Rand() % 10000, f2 = Rand() % 20; double add1 = Exp(LogAdd(Log(f1), Log(f2))); @@ -167,7 +167,8 @@ void UnitTestLogAddSub() { try { - double f2_check = Exp(LogSub(Log(add), Log(f1))), thresh = (f2*0.01)+0.001; + double f2_check = Exp(LogSub(Log(add), Log(f1))), + thresh = (f2*0.01)+0.001; KALDI_ASSERT(std::abs(f2_check-f2) < thresh); } catch(...) { KALDI_ASSERT(f2 == 0); // It will probably crash for f2=0. @@ -192,17 +193,20 @@ void UnitTestDefines() { // Yes, we even unit-test the preprocessor statements. 
std::cout << 1.0+DBL_EPSILON; std::cout << 1.0 + 0.5*DBL_EPSILON; KALDI_ASSERT(1.0 + DBL_EPSILON != 1.0 && 1.0 + (0.5*DBL_EPSILON) == 1.0 - && "If this test fails, you can probably just comment it out-- may mean your CPU exceeds expected floating point precision"); + && "If this test fails, you can probably just comment it out-- " + "may mean your CPU exceeds expected floating point precision"); KALDI_ASSERT(1.0f + FLT_EPSILON != 1.0f && 1.0f + (0.5f*FLT_EPSILON) == 1.0f - && "If this test fails, you can probably just comment it out-- may mean your CPU exceeds expected floating point precision"); - KALDI_ASSERT(std::abs(sin(M_PI)) < 1.0e-05 && std::abs(cos(M_PI)+1.0) < 1.0e-05); - KALDI_ASSERT(std::abs(sin(M_2PI)) < 1.0e-05 && std::abs(cos(M_2PI)-1.0) < 1.0e-05); + && "If this test fails, you can probably just comment it out-- " + "may mean your CPU exceeds expected floating point precision"); + KALDI_ASSERT(std::abs(sin(M_PI)) < 1.0e-05 + && std::abs(cos(M_PI)+1.0) < 1.0e-05); + KALDI_ASSERT(std::abs(sin(M_2PI)) < 1.0e-05 + && std::abs(cos(M_2PI)-1.0) < 1.0e-05); KALDI_ASSERT(std::abs(sin(Exp(M_LOG_2PI))) < 1.0e-05); KALDI_ASSERT(std::abs(cos(Exp(M_LOG_2PI)) - 1.0) < 1.0e-05); } void UnitTestAssertFunc() { // Testing Assert** *functions - using namespace kaldi; for (int i = 1; i < 100; i++) { float f1 = Rand() % 10000 + 1, f2 = Rand() % 20 + 1; float tmp1 = f1 * f2; @@ -234,7 +238,7 @@ template void UnitTestFactorizeTpl() { void UnitTestFactorize() { UnitTestFactorizeTpl(); UnitTestFactorizeTpl(); - UnitTestFactorizeTpl(); + UnitTestFactorizeTpl(); } void UnitTestApproxEqual() { @@ -254,7 +258,7 @@ void UnitTestApproxEqual() { KALDI_ASSERT(!ApproxEqual(-std::numeric_limits::infinity(), 0)); KALDI_ASSERT(!ApproxEqual(-std::numeric_limits::infinity(), - 1)); + 1)); } template @@ -273,8 +277,8 @@ void UnitTestExpSpeed() { KALDI_ASSERT(sum > 0.0); // make it harder for the compiler to optimize Exp // away, as we have a conditional. Real flops = 1.0e-06 * num_ops / tim.Elapsed(); - KALDI_LOG << "Megaflops doing Exp(" << (sizeof(Real) == 4 ? "float" : "double") - << ") is " << flops; + KALDI_LOG << "Megaflops doing Exp(" + << (sizeof(Real) == 4 ? "float" : "double") << ") is " << flops; } @@ -287,15 +291,15 @@ void UnitTestLogSpeed() { Timer tim; while (tim.Elapsed() < time) { for (int i = 0; i < block_size; i++) { - sum += Log((float)(i + 1)); + sum += Log(static_cast(i + 1)); } num_ops += block_size; } KALDI_ASSERT(sum > 0.0); // make it harder for the compiler to optimize Log // away, as we have a conditional. Real flops = 1.0e-06 * num_ops / tim.Elapsed(); - KALDI_LOG << "Megaflops doing Log(" << (sizeof(Real) == 4 ? "float" : "double") - << ") is " << flops; + KALDI_LOG << "Megaflops doing Log(" + << (sizeof(Real) == 4 ? "float" : "double") << ") is " << flops; } } // end namespace kaldi. diff --git a/src/base/kaldi-math.cc b/src/base/kaldi-math.cc index 3496794e78a..dd269fd0cbc 100644 --- a/src/base/kaldi-math.cc +++ b/src/base/kaldi-math.cc @@ -18,11 +18,11 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
-#include #include "base/kaldi-math.h" #ifndef _MSC_VER #include #endif +#include namespace kaldi { // These routines are tested in matrix/matrix-test.cc @@ -42,16 +42,14 @@ int32 RoundUpToNearestPowerOfTwo(int32 n) { static pthread_mutex_t _RandMutex = PTHREAD_MUTEX_INITIALIZER; #endif -int Rand(struct RandomState* state) -{ +int Rand(struct RandomState* state) { #ifdef _MSC_VER // On Windows, just call Rand() return rand(); #else if (state) { return rand_r(&(state->seed)); - } - else { + } else { int rs = pthread_mutex_lock(&_RandMutex); KALDI_ASSERT(rs == 0); int val = rand(); @@ -86,7 +84,7 @@ bool WithProb(BaseFloat prob, struct RandomState* state) { // prob is very small but nonzero, and the "main algorithm" // wouldn't work that well. So: with probability 1/128, we // return WithProb (prob * 128), else return false. - if (Rand(state) < RAND_MAX / 128) { // with probability 128... + if (Rand(state) < RAND_MAX / 128) { // with probability 128... // Note: we know that prob * 128.0 < 1.0, because // we asserted RAND_MAX > 128 * 128. return WithProb(prob * 128.0); @@ -98,7 +96,8 @@ bool WithProb(BaseFloat prob, struct RandomState* state) { } } -int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { // This is not exact. +int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { + // This is not exact. KALDI_ASSERT(max_val >= min_val); if (max_val == min_val) return min_val; @@ -106,9 +105,11 @@ int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { // Thi // RAND_MAX is quite small on Windows -> may need to handle larger numbers. if (RAND_MAX > (max_val-min_val)*8) { // *8 to avoid large inaccuracies in probability, from the modulus... - return min_val + ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); + return min_val + + ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); } else { - if ((unsigned int)(RAND_MAX*RAND_MAX) > (unsigned int)((max_val+1-min_val)*8)) { + if ((unsigned int)(RAND_MAX*RAND_MAX) > + (unsigned int)((max_val+1-min_val)*8)) { // *8 to avoid inaccuracies in probability, from the modulus... return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) % (unsigned int)(max_val+1-min_val)); @@ -121,7 +122,7 @@ int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { // Thi } #else return min_val + - (static_cast(Rand(state)) % (int32)(max_val+1-min_val)); + (static_cast(Rand(state)) % static_cast(max_val+1-min_val)); #endif } @@ -141,8 +142,7 @@ int32 RandPoisson(float lambda, struct RandomState* state) { return k-1; } -void RandGauss2(float *a, float *b, RandomState *state) -{ +void RandGauss2(float *a, float *b, RandomState *state) { KALDI_ASSERT(a); KALDI_ASSERT(b); float u1 = RandUniform(state); @@ -153,15 +153,15 @@ void RandGauss2(float *a, float *b, RandomState *state) *b = u1 * sinf(u2); } -void RandGauss2(double *a, double *b, RandomState *state) -{ +void RandGauss2(double *a, double *b, RandomState *state) { KALDI_ASSERT(a); KALDI_ASSERT(b); float a_float, b_float; // Just because we're using doubles doesn't mean we need super-high-quality // random numbers, so we just use the floating-point version internally. 
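
WithProb() handles very small probabilities with a rescaling trick: with probability 1/128 it recurses on prob * 128, otherwise it returns false, which keeps the expected acceptance rate at (1/128) * (128 * prob) = prob while the direct Rand() comparison only ever deals with probabilities that are not tiny relative to RAND_MAX. A self-contained sketch of the same idea using std::rand (Kaldi's version additionally goes through its thread-safe Rand() and asserts on the argument range):

    #include <cassert>
    #include <cstdlib>
    #include <iostream>

    // Return true with probability 'prob', 0 <= prob <= 1, handling tiny
    // 'prob' by rescaling instead of comparing against a value near zero.
    bool WithProbSketch(double prob) {
      assert(prob >= 0.0 && prob <= 1.0);
      if (prob == 0.0) return false;
      if (prob == 1.0) return true;
      if (prob * 128.0 < 1.0) {
        // With probability 1/128, recurse on a 128x larger probability.
        if (std::rand() < RAND_MAX / 128) return WithProbSketch(prob * 128.0);
        return false;
      }
      return std::rand() < RAND_MAX * prob;
    }

    int main() {
      int hits = 0, trials = 1000000;
      for (int i = 0; i < trials; i++)
        if (WithProbSketch(0.001)) ++hits;
      std::cout << "empirical probability: "
                << hits / static_cast<double>(trials)
                << " (expected about 0.001)\n";
      return 0;
    }
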
RandGauss2(&a_float, &b_float, state); - *a = a_float; *b = b_float; + *a = a_float; + *b = b_float; } diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index edbc8010195..ac590a06a25 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -41,20 +41,19 @@ #endif #ifndef M_PI -# define M_PI 3.1415926535897932384626433832795 +#define M_PI 3.1415926535897932384626433832795 #endif #ifndef M_SQRT2 -# define M_SQRT2 1.4142135623730950488016887 +#define M_SQRT2 1.4142135623730950488016887 #endif - #ifndef M_2PI -# define M_2PI 6.283185307179586476925286766559005 +#define M_2PI 6.283185307179586476925286766559005 #endif #ifndef M_SQRT1_2 -# define M_SQRT1_2 0.7071067811865475244008443621048490 +#define M_SQRT1_2 0.7071067811865475244008443621048490 #endif #ifndef M_LOG_2PI @@ -65,6 +64,11 @@ #define M_LN2 0.693147180559945309417232121458 #endif +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684 +#endif + + #define KALDI_ISNAN std::isnan #define KALDI_ISINF std::isinf #define KALDI_ISFINITE(x) std::isfinite(x) @@ -81,7 +85,7 @@ inline double Exp(double x) { return exp(x); } inline float Exp(float x) { return expf(x); } #else inline float Exp(float x) { return exp(static_cast(x)); } -#endif // KALDI_NO_EXPF +#endif // KALDI_NO_EXPF #else inline double Exp(double x) { return exp(x); } #if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) @@ -90,8 +94,8 @@ inline double Exp(double x) { return exp(x); } inline float Exp(float x) { return exp(static_cast(x)); } #else inline float Exp(float x) { return expf(x); } -#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) +#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) +#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) inline double Log(double x) { return log(x); } inline float Log(float x) { return logf(x); } @@ -126,7 +130,7 @@ const double kLogZeroDouble = -std::numeric_limits::infinity(); const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); // Returns a random integer between 0 and RAND_MAX, inclusive -int Rand(struct RandomState* state=NULL); +int Rand(struct RandomState* state = NULL); // State for thread-safe random number generator struct RandomState { @@ -135,9 +139,10 @@ struct RandomState { }; // Returns a random integer between min and max inclusive. -int32 RandInt(int32 min, int32 max, struct RandomState* state=NULL); +int32 RandInt(int32 min, int32 max, struct RandomState* state = NULL); -bool WithProb(BaseFloat prob, struct RandomState* state=NULL); // Returns true with probability "prob", +// Returns true with probability "prob", +bool WithProb(BaseFloat prob, struct RandomState* state = NULL); // with 0 <= prob <= 1 [we check this]. // Internally calls Rand(). This function is carefully implemented so // that it should work even if prob is very small. @@ -155,7 +160,7 @@ inline float RandGauss(struct RandomState* state = NULL) { // Returns poisson-distributed random number. Uses Knuth's algorithm. // Take care: this takes time proportinal // to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state=NULL); +int32 RandPoisson(float lambda, struct RandomState* state = NULL); // Returns a pair of gaussian random numbers. 
Uses Box-Muller transform void RandGauss2(float *a, float *b, RandomState *state = NULL); @@ -166,7 +171,8 @@ void RandGauss2(double *a, double *b, RandomState *state = NULL); // This is a randomized pruning mechanism that preserves expectations, // that we typically use to prune posteriors. template -inline Float RandPrune(Float post, BaseFloat prune_thresh, struct RandomState* state=NULL) { +inline Float RandPrune(Float post, BaseFloat prune_thresh, + struct RandomState* state = NULL) { KALDI_ASSERT(prune_thresh >= 0.0); if (post == 0.0 || std::abs(post) >= prune_thresh) return post; @@ -256,11 +262,11 @@ inline float LogSub(float x, float y) { static inline bool ApproxEqual(float a, float b, float relative_tolerance = 0.001) { // a==b handles infinities. - if (a==b) return true; + if (a == b) return true; float diff = std::abs(a-b); if (diff == std::numeric_limits::infinity() - || diff != diff) return false; // diff is +inf or nan. - return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); + || diff != diff) return false; // diff is +inf or nan. + return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); } /// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b)) diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index c67529eb917..7ebf4f85386 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,15 +39,37 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include +// for discussion on what to do if you need compile kaldi +// without OpenFST, see the bottom of this this file +#include + namespace kaldi { -typedef uint16_t uint16; -typedef uint32_t uint32; -typedef uint64_t uint64; -typedef int16_t int16; -typedef int32_t int32; -typedef int64_t int64; -typedef float float32; -typedef double double64; + using ::int16; + using ::int32; + using ::int64; + using ::uint16; + using ::uint32; + using ::uint64; + typedef float float32; + typedef double double64; +} // end namespace kaldi + +// In a theoretical case you decide compile Kaldi without the OpenFST +// comment the previous namespace statement and uncomment the following +/* +namespace kaldi { + typedef int8_t int8; + typedef int16_t int16; + typedef int32_t int32; + typedef int64_t int64; + + typedef uint8_t uint8; + typedef uint16_t uint16; + typedef uint32_t uint32; + typedef uint64_t uint64; + typedef float float32; + typedef double double64; } // end namespace kaldi +*/ #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/base/kaldi-utils.cc b/src/base/kaldi-utils.cc index c7d82a7c4c5..1ae1dc0b758 100644 --- a/src/base/kaldi-utils.cc +++ b/src/base/kaldi-utils.cc @@ -16,26 +16,29 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
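
RandGauss2() produces a pair of independent Gaussian samples from two uniform samples via the Box-Muller transform, and the double overload simply reuses the float version since high-quality randomness is not needed there. The transform itself as a standalone sketch, with a quick empirical check of the first two moments:

    #include <cmath>
    #include <cstdlib>
    #include <iostream>

    // Box-Muller: two independent uniform (0,1] samples -> two independent
    // N(0,1) samples.
    void BoxMullerPair(float *a, float *b) {
      const float kTwoPi = 6.2831853071795864769f;
      float u1 = (std::rand() + 1.0f) / (RAND_MAX + 1.0f);  // in (0,1], avoids log(0).
      float u2 = (std::rand() + 1.0f) / (RAND_MAX + 1.0f);
      float r = std::sqrt(-2.0f * std::log(u1));
      *a = r * std::cos(kTwoPi * u2);
      *b = r * std::sin(kTwoPi * u2);
    }

    int main() {
      double sum = 0.0, sumsq = 0.0;
      const int n = 100000;  // must be even.
      for (int i = 0; i < n; i += 2) {
        float a, b;
        BoxMullerPair(&a, &b);
        sum += a + b;
        sumsq += a * a + b * b;
      }
      std::cout << "sample mean " << sum / n << " (expect ~0), "
                << "sample second moment " << sumsq / n << " (expect ~1)\n";
      return 0;
    }
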
-#include -#include "base/kaldi-common.h" - - #ifdef _WIN32_WINNT_WIN8 #include -#elif defined (_WIN32) || defined(_MSC_VER) || defined(MINGW) +#elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW) #include +#if defined(_MSC_VER) && _MSC_VER < 1900 +#define snprintf _snprintf +#endif /* _MSC_VER < 1900 */ #else #include #endif +#include +#include "base/kaldi-common.h" + + namespace kaldi { std::string CharToString(const char &c) { char buf[20]; if (std::isprint(c)) - sprintf(buf, "\'%c\'", c); + snprintf(buf, sizeof(buf), "\'%c\'", c); else - sprintf(buf, "[character %d]", (int) c); + snprintf(buf, sizeof(buf), "[character %d]", static_cast(c)); return (std::string) buf; } diff --git a/src/base/kaldi-utils.h b/src/base/kaldi-utils.h index deac0f6b634..47c60b4b01d 100644 --- a/src/base/kaldi-utils.h +++ b/src/base/kaldi-utils.h @@ -21,15 +21,22 @@ #ifndef KALDI_BASE_KALDI_UTILS_H_ #define KALDI_BASE_KALDI_UTILS_H_ 1 -#include -#include - #if defined(_MSC_VER) # define WIN32_LEAN_AND_MEAN # define NOMINMAX # include #endif +#ifdef _MSC_VER +#include +#define unlink _unlink +#else +#include +#endif + +#include +#include + #if defined(_MSC_VER) #pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) #if _MSC_VER < 1400 @@ -39,22 +46,14 @@ #endif #endif -#ifdef HAVE_POSIX_MEMALIGN -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) -# define KALDI_MEMALIGN_FREE(x) free(x) -#elif defined(HAVE_MEMALIGN) - /* Some systems have memalign() but no declaration for it */ - void * memalign(size_t align, size_t size); -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = memalign(align, size)) -# define KALDI_MEMALIGN_FREE(x) free(x) -#elif defined(_MSC_VER) +#ifdef _MSC_VER # define KALDI_MEMALIGN(align, size, pp_orig) \ (*(pp_orig) = _aligned_malloc(size, align)) # define KALDI_MEMALIGN_FREE(x) _aligned_free(x) #else -#error Manual memory alignment is no longer supported +# define KALDI_MEMALIGN(align, size, pp_orig) \ + (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) +# define KALDI_MEMALIGN_FREE(x) free(x) #endif #ifdef __ICC @@ -82,22 +81,36 @@ inline int MachineIsLittleEndian() { return (*reinterpret_cast(&check) != 0); } -// This function kaldi::Sleep() provides a portable way to sleep for a possibly fractional +// This function kaldi::Sleep() provides a portable way +// to sleep for a possibly fractional // number of seconds. On Windows it's only accurate to microseconds. 
void Sleep(float seconds); - } #define KALDI_SWAP8(a) { \ - int t = ((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[7]; ((char*)&a)[7]=t;\ - t = ((char*)&a)[1]; ((char*)&a)[1]=((char*)&a)[6]; ((char*)&a)[6]=t;\ - t = ((char*)&a)[2]; ((char*)&a)[2]=((char*)&a)[5]; ((char*)&a)[5]=t;\ - t = ((char*)&a)[3]; ((char*)&a)[3]=((char*)&a)[4]; ((char*)&a)[4]=t;} + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ + (reinterpret_cast(&a))[7]=t;\ + t = (reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ + (reinterpret_cast(&a))[6]=t;\ + t = (reinterpret_cast(&a))[2];\ + (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ + (reinterpret_cast(&a))[5]=t;\ + t = (reinterpret_cast(&a))[3];\ + (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ + (reinterpret_cast(&a))[4]=t;} #define KALDI_SWAP4(a) { \ - int t = ((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[3]; ((char*)&a)[3]=t;\ - t = ((char*)&a)[1]; ((char*)&a)[1]=((char*)&a)[2]; ((char*)&a)[2]=t;} + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ + (reinterpret_cast(&a))[3]=t;\ + t = (reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ + (reinterpret_cast(&a))[2]=t;} #define KALDI_SWAP2(a) { \ - int t = ((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[1]; ((char*)&a)[1]=t;} + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=t;} // Makes copy constructor and operator= private. Same as in compat.h of OpenFst @@ -109,7 +122,7 @@ void Sleep(float seconds); template class KaldiCompileTimeAssert { }; template<> class KaldiCompileTimeAssert { public: - static inline void Check() { } + static inline void Check() { } }; #define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() @@ -122,14 +135,6 @@ template<> class KaldiCompileTimeAssert { KaldiCompileTimeAssert::is_specialized \ && !std::numeric_limits::is_integer>::Check() -#ifdef _MSC_VER -#include -#define unlink _unlink -#else -#include -#endif - - #ifdef _MSC_VER #define KALDI_STRCASECMP _stricmp #else diff --git a/src/base/timer-test.cc b/src/base/timer-test.cc index 32ceebd9f6e..86a20b486ee 100644 --- a/src/base/timer-test.cc +++ b/src/base/timer-test.cc @@ -36,7 +36,6 @@ void TimerTest() { KALDI_ERR << "Timer fail: waited " << f << " seconds instead of " << time_secs << " secs."; } - } diff --git a/src/base/timer.h b/src/base/timer.h index d93a46143c2..eff7da31529 100644 --- a/src/base/timer.h +++ b/src/base/timer.h @@ -25,9 +25,7 @@ #if defined(_MSC_VER) || defined(MINGW) -namespace kaldi -{ - +namespace kaldi { class Timer { public: Timer() { Reset(); } @@ -38,9 +36,14 @@ class Timer { LARGE_INTEGER time_end; LARGE_INTEGER freq; QueryPerformanceCounter(&time_end); - if (QueryPerformanceFrequency(&freq) == 0) return 0.0; // Hardware does not support this. - return ((double)time_end.QuadPart - (double)time_start_.QuadPart) / - ((double)freq.QuadPart); + + if (QueryPerformanceFrequency(&freq) == 0) { + // Hardware does not support this. 
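
The KALDI_SWAP{8,4,2} macros reverse byte order in place through reinterpret_cast<char*>, and MachineIsLittleEndian() tests whether the low-order byte of an int set to 1 comes first in memory. The same two ideas as plain functions (a sketch, not the macros themselves):

    #include <cstdio>

    inline int MachineIsLittleEndianSketch() {
      int check = 1;
      return *reinterpret_cast<char*>(&check) != 0;  // low byte first => little-endian.
    }

    // In-place byte reversal of a 4-byte object, equivalent in effect to KALDI_SWAP4.
    inline void Swap4(void *p) {
      char *c = reinterpret_cast<char*>(p);
      char t = c[0]; c[0] = c[3]; c[3] = t;
      t = c[1]; c[1] = c[2]; c[2] = t;
    }

    int main() {
      unsigned int x = 0x11223344u;
      Swap4(&x);
      std::printf("little-endian: %d, swapped: 0x%08x\n",
                  MachineIsLittleEndianSketch(), x);  // expect 0x44332211.
      return 0;
    }
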
+ return 0.0; + } + return (static_cast(time_end.QuadPart) - + static_cast(time_start_.QuadPart)) / + (static_cast(freq.QuadPart)); } private: LARGE_INTEGER time_start_; @@ -48,13 +51,11 @@ class Timer { } #else +#include +#include -# include -# include -namespace kaldi -{ -class Timer -{ +namespace kaldi { +class Timer { public: Timer() { Reset(); } @@ -65,9 +66,10 @@ class Timer struct timeval time_end; gettimeofday(&time_end, &time_zone_); double t1, t2; - t1 = (double)time_start_.tv_sec + - (double)time_start_.tv_usec/(1000*1000); - t2 = (double)time_end.tv_sec + (double)time_end.tv_usec/(1000*1000); + t1 = static_cast(time_start_.tv_sec) + + static_cast(time_start_.tv_usec)/(1000*1000); + t2 = static_cast(time_end.tv_sec) + + static_cast(time_end.tv_usec)/(1000*1000); return t2-t1; } @@ -80,4 +82,4 @@ class Timer #endif -#endif +#endif // KALDI_BASE_TIMER_H_ diff --git a/src/bin/Makefile b/src/bin/Makefile index ac175e42e0e..a1df9b5d48a 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -1,12 +1,14 @@ all: + -rm -f arpa2fst EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk BINFILES = align-equal align-equal-compiled acc-tree-stats \ show-alignments compile-questions cluster-phones \ - compute-wer make-h-transducer add-self-loops convert-ali \ - compile-train-graphs compile-train-graphs-fsts arpa2fst \ + compute-wer compute-wer-bootci make-h-transducer \ + add-self-loops convert-ali \ + compile-train-graphs compile-train-graphs-fsts \ make-pdf-to-tid-transducer make-ilabel-transducer show-transitions \ ali-to-phones ali-to-post weight-silence-post acc-lda est-lda \ ali-to-pdf est-mllt build-tree build-tree-two-level decode-faster \ @@ -36,4 +38,3 @@ ADDLIBS = ../lm/kaldi-lm.a ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \ TESTFILES = include ../makefiles/default_rules.mk - diff --git a/src/bin/acc-lda.cc b/src/bin/acc-lda.cc index 8169ae79bde..92cd192b9a6 100644 --- a/src/bin/acc-lda.cc +++ b/src/bin/acc-lda.cc @@ -86,7 +86,7 @@ int main(int argc, char *argv[]) { if (feats.NumRows() != static_cast(post.size())) { KALDI_WARN << "Posterior vs. feats size mismatch " - << feats.NumRows() << " vs. " < \n" "e.g.: \n" " acc-tree-stats 1.mdl scp:train.scp ark:1.ali 1.tacc\n"; - ParseOptions po(usage); + bool binary = true; - float var_floor = 0.01; - string ci_phones_str; - std::string phone_map_rxfilename; - int N = 3; - int P = 1; + AccumulateTreeStatsOptions opts; + ParseOptions po(usage); po.Register("binary", &binary, "Write output in binary mode"); - po.Register("var-floor", &var_floor, "Variance floor for tree clustering."); - po.Register("ci-phones", &ci_phones_str, "Colon-separated list of integer " - "indices of context-independent phones (after mapping, if " - "--phone-map option is used)."); - po.Register("context-width", &N, "Context window size."); - po.Register("central-position", &P, "Central context-window position " - "(zero-based)"); - po.Register("phone-map", &phone_map_rxfilename, - "File name containing old->new phone mapping (each line is: " - "old-integer-id new-integer-id)"); - + opts.Register(&po); + po.Read(argc, argv); - if (po.NumArgs() < 3 || po.NumArgs() > 4) { + if (po.NumArgs() != 4) { po.PrintUsage(); exit(1); } @@ -71,22 +59,8 @@ int main(int argc, char *argv[]) { alignment_rspecifier = po.GetArg(3), accs_out_wxfilename = po.GetOptArg(4); - std::vector phone_map; - if (phone_map_rxfilename != "") { // read phone map. 
- ReadPhoneMap(phone_map_rxfilename, - &phone_map); - } - - std::vector ci_phones; - if (ci_phones_str != "") { - SplitStringToIntegers(ci_phones_str, ":", false, &ci_phones); - std::sort(ci_phones.begin(), ci_phones.end()); - if (!IsSortedAndUniq(ci_phones) || ci_phones[0] == 0) { - KALDI_ERR << "Invalid set of ci_phones: " << ci_phones_str; - } - } - + AccumulateTreeStatsInfo acc_tree_stats_info(opts); TransitionModel trans_model; { @@ -117,15 +91,10 @@ int main(int argc, char *argv[]) { continue; } - ////// This is the important part of this program. //////// AccumulateTreeStats(trans_model, - var_floor, - N, - P, - ci_phones, + acc_tree_stats_info, alignment, mat, - (phone_map_rxfilename != "" ? &phone_map : NULL), &tree_stats); num_done++; if (num_done % 1000 == 0) @@ -135,9 +104,9 @@ int main(int argc, char *argv[]) { BuildTreeStatsType stats; // vectorized form. - for (std::map::const_iterator iter = tree_stats.begin(); - iter != tree_stats.end(); - iter++ ) { + for (std::map::const_iterator iter = tree_stats.begin(); + iter != tree_stats.end(); + ++iter) { stats.push_back(std::make_pair(iter->first, iter->second)); } tree_stats.clear(); diff --git a/src/bin/ali-to-phones.cc b/src/bin/ali-to-phones.cc index 9b95721bd33..b370dbc7f18 100644 --- a/src/bin/ali-to-phones.cc +++ b/src/bin/ali-to-phones.cc @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) { std::string model_filename = po.GetArg(1), alignments_rspecifier = po.GetArg(2); - + TransitionModel trans_model; ReadKaldiObject(model_filename, &trans_model); @@ -77,12 +77,12 @@ int main(int argc, char *argv[]) { (write_lengths ? empty : po.GetArg(3))); Int32PairVectorWriter pair_writer(ctm_output ? empty : (write_lengths ? po.GetArg(3) : empty)); - + std::string ctm_wxfilename(ctm_output ? po.GetArg(3) : empty); Output ctm_writer(ctm_wxfilename, false); if (ctm_output) { ctm_writer.Stream() << std::fixed; - ctm_writer.Stream().precision(2); + ctm_writer.Stream().precision(frame_shift >= 0.01 ? 
2 : 3); } int32 n_done = 0; diff --git a/src/bin/align-equal-compiled.cc b/src/bin/align-equal-compiled.cc index 663309a589b..c4ab9d4205a 100644 --- a/src/bin/align-equal-compiled.cc +++ b/src/bin/align-equal-compiled.cc @@ -39,7 +39,7 @@ int main(int argc, char *argv[]) { const char *usage = "Write an equally spaced alignment (for getting training started)" "Usage: align-equal-compiled \n" "e.g.: \n" - " align-equal-compiled 1.mdl 1.fsts scp:train.scp ark:equal.ali\n"; + " align-equal-compiled 1.fsts scp:train.scp ark:equal.ali\n"; ParseOptions po(usage); bool binary = true; diff --git a/src/bin/align-text.cc b/src/bin/align-text.cc index 04172f3b5f3..833e29efe3b 100644 --- a/src/bin/align-text.cc +++ b/src/bin/align-text.cc @@ -47,7 +47,9 @@ int main(int argc, char *argv[]) { "\n" "Usage: align-text [options] \\\n" " \n" - " e.g.: align-text ark:text1.txt ark:text2.txt ark,t:alignment.txt\n"; + " e.g.: align-text ark:text1.txt ark:text2.txt ark,t:alignment.txt\n" + "See also: compute-wer,\n" + "Example scoring script: egs/wsj/s5/steps/score_kaldi.sh\n"; ParseOptions po(usage); diff --git a/src/bin/am-info.cc b/src/bin/am-info.cc index e8cdc1977ec..6afb0c5014e 100644 --- a/src/bin/am-info.cc +++ b/src/bin/am-info.cc @@ -1,4 +1,4 @@ -// gmmbin/am-info.cc +// bin/am-info.cc // Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey) diff --git a/src/bin/analyze-counts.cc b/src/bin/analyze-counts.cc index 60be710c79d..80d43891696 100644 --- a/src/bin/analyze-counts.cc +++ b/src/bin/analyze-counts.cc @@ -1,6 +1,6 @@ // bin/analyze-counts.cc -// Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +// Copyright 2012-2016 Brno University of Technology (Author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // @@ -34,22 +34,38 @@ int main(int argc, char *argv[]) { try { const char *usage = "Computes element counts from integer vector table.\n" - "(e.g. for example to get pdf-counts to estimate DNN-output priors, for data analysis)\n" + "(e.g. 
get pdf-counts to estimate DNN-output priors " + "for data analysis)\n" "Verbosity : level 1 => print frequencies and histogram\n" "\n" - "Usage: analyze-counts [options] \n" + "Usage: analyze-counts [options] " + "\n" "e.g.: \n" " analyze-counts ark:1.ali prior.counts\n" " Show phone counts by:\n" - " ali-to-phone --per-frame=true ark:1.ali ark:- | analyze-counts --verbose=1 ark:- - >/dev/null\n"; - + " ali-to-phone --per-frame=true ark:1.ali ark:- |" + " analyze-counts --verbose=1 ark:- - >/dev/null\n"; + ParseOptions po(usage); - + bool binary = false; std::string symbol_table_filename = ""; - + po.Register("binary", &binary, "write in binary mode"); - po.Register("symbol-table", &symbol_table_filename, "Read symbol table for display of counts"); + po.Register("symbol-table", &symbol_table_filename, + "Read symbol table for display of counts"); + + int32 counts_dim = 0; + po.Register("counts-dim", &counts_dim, + "Output dimension of the counts, " + "a hint for dimension auto-detection."); + + std::string frame_weights; + po.Register("frame-weights", &frame_weights, + "Per-frame weights (counting weighted frames)."); + std::string utt_weights; + po.Register("utt-weights", &utt_weights, + "Per-utterance weights (counting weighted frames)."); po.Read(argc, argv); @@ -61,79 +77,121 @@ int main(int argc, char *argv[]) { std::string alignments_rspecifier = po.GetArg(1), wxfilename = po.GetArg(2); - SequentialInt32VectorReader reader(alignments_rspecifier); + SequentialInt32VectorReader alignment_reader(alignments_rspecifier); - // Get the counts - std::vector counts; - int32 num_done = 0; - for (; !reader.Done(); reader.Next()) { - std::string key = reader.Key(); - std::vector alignment = reader.Value(); + RandomAccessBaseFloatVectorReader weights_reader; + if (frame_weights != "") { + weights_reader.Open(frame_weights); + } + RandomAccessBaseFloatReader utt_weights_reader; + if (utt_weights != "") { + utt_weights_reader.Open(utt_weights); + } - for (size_t i = 0; i < alignment.size(); i++) { - int32 value = alignment[i]; - if(value >= counts.size()) { - counts.resize(value+1); + // Buffer for accumulating the counts + Vector counts(counts_dim, kSetZero); + + int32 num_done = 0, num_other_error = 0; + for (; !alignment_reader.Done(); alignment_reader.Next()) { + std::string utt = alignment_reader.Key(); + const std::vector &alignment = alignment_reader.Value(); + + BaseFloat utt_w = 1.0; + // Check if per-utterance weights are provided + if (utt_weights != "") { + if (!utt_weights_reader.HasKey(utt)) { + KALDI_WARN << utt << ", missing per-utterance weight"; + num_other_error++; + continue; + } else { + utt_w = utt_weights_reader.Value(utt); + } + } + + Vector frame_w; + // Check if per-frame weights are provided + if (frame_weights != "") { + if (!weights_reader.HasKey(utt)) { + KALDI_WARN << utt << ", missing per-frame weights"; + num_other_error++; + continue; + } else { + frame_w = weights_reader.Value(utt); + KALDI_ASSERT(frame_w.Dim() == alignment.size()); } - counts[value]++; // Accumulate } + // Accumulate the counts + for (size_t i = 0; i < alignment.size(); i++) { + KALDI_ASSERT(alignment[i] >= 0); + // Extend the vector if it is not large enough to hold every pdf-ids + if (alignment[i] >= counts.Dim()) { + counts.Resize(alignment[i]+1, kCopyData); + } + if (frame_weights != "") { + counts(alignment[i]) += 1.0 * utt_w * frame_w(i); + } else { + counts(alignment[i]) += 1.0 * utt_w; + } + } num_done++; } - // We need at least one occurence for each tgt, so there is no nan during 
decoding - std::vector counts_nozero(counts); - for(size_t i = 0; i < counts.size(); i++) { - if(counts_nozero[i] == 0) { - KALDI_WARN << "Zero count for element " << i << ", force setting to one." - << " This avoids divide-by-zero when we use the counts in decoding."; - counts_nozero[i]++; + // Report elements with zero counts + for (size_t i = 0; i < counts.Dim(); i++) { + if (0.0 == counts(i)) { + KALDI_WARN << "Zero count for label " << i << ", this is suspicious."; } } - // Write + // Add a ``half-frame'' to all the elements to + // avoid zero-counts which would cause problems in decoding + Vector counts_nozero(counts); + counts_nozero.Add(0.5); + Output ko(wxfilename, binary); - WriteIntegerVector(ko.Stream(), binary, counts_nozero); + counts_nozero.Write(ko.Stream(), binary); - //// - //// THE REST IS FOR ANALYSIS, IT GETS PRINTED TO LOG - //// + // + // THE REST IS FOR ANALYSIS, IT GETS PRINTED TO LOG + // if (symbol_table_filename != "" || (kaldi::g_kaldi_verbose_level >= 1)) { - // load the symbol table fst::SymbolTable *elem_syms = NULL; if (symbol_table_filename != "") { elem_syms = fst::SymbolTable::ReadText(symbol_table_filename); if (!elem_syms) - KALDI_ERR << "Could not read symbol table from file " << symbol_table_filename; + KALDI_ERR << "Could not read symbol table from file " + << symbol_table_filename; } - + // sort the counts - std::vector > sorted_counts; - for (int32 i = 0; i < counts.size(); i++) { - sorted_counts.push_back(std::make_pair(static_cast(counts[i]), i)); + std::vector > sorted_counts; + for (int32 i = 0; i < counts.Dim(); i++) { + sorted_counts.push_back( + std::make_pair(static_cast(counts(i)), i)); } std::sort(sorted_counts.begin(), sorted_counts.end()); - - // print std::ostringstream os; - int32 sum = std::accumulate(counts.begin(),counts.end(), 0); + double sum = counts.Sum(); os << "Printing...\n### The sorted count table," << std::endl; os << "count\t(norm),\tid\t(symbol):" << std::endl; - for (int32 i=0; i(sorted_counts[i].first) / sum << "),\t" - << sorted_counts[i].second << "\t" - << (elem_syms != NULL ? std::string("(")+elem_syms->Find(sorted_counts[i].second)+")" : "") + << sorted_counts[i].second << "\t" + << (elem_syms != NULL ? "(" + + elem_syms->Find(sorted_counts[i].second) + ")" : "") << std::endl; } - os << "\n#total " << sum - << " (" << static_cast(sum)/100/3600 << "h)" + os << "\n#total " << sum + << " (" << static_cast(sum)/100/3600 << "h)" << std::endl; KALDI_LOG << os.str(); } - KALDI_LOG << "Summed " << num_done << " int32 vectors to counts."; + KALDI_LOG << "Summed " << num_done << " int32 vectors to counts, " + << "skipped " << num_other_error << " vectors."; KALDI_LOG << "Counts written to " << wxfilename; return 0; } catch(const std::exception &e) { @@ -141,5 +199,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/bin/arpa2fst.cc b/src/bin/arpa2fst.cc deleted file mode 100755 index b118aba3f94..00000000000 --- a/src/bin/arpa2fst.cc +++ /dev/null @@ -1,62 +0,0 @@ -// bin/arpa2fst.cc -// -// Copyright 2009-2011 Gilles Boulianne. -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -/// @addtogroup LanguageModel -/// @{ - -/** - * @file arpa2fst.cc - * @brief Example for converting an ARPA format language model into an FST. - * - */ - -#include -#include "lm/kaldi-lm.h" -#include "util/parse-options.h" - -int main(int argc, char *argv[]) { - try { - const char *usage = - "Converts an ARPA format language model into a FST\n" - "Usage: arpa2fst [opts] (input_arpa|-) [output_fst|-]\n"; - kaldi::ParseOptions po(usage); - - bool natural_base = true; - po.Register("natural-base", &natural_base, "Use log-base e (not log-base 10)"); - po.Read(argc, argv); - - if (po.NumArgs() != 1 && po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string arpa_filename = po.GetArg(1), - fst_filename = po.GetOptArg(2); - - kaldi::LangModelFst lm; - // read from standard input and write to standard output - lm.Read(arpa_filename, kaldi::kArpaLm, NULL, natural_base); - lm.Write(fst_filename); - exit(0); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} -/// @} - diff --git a/src/bin/compile-questions.cc b/src/bin/compile-questions.cc index 09225f58217..a6caafcc3f4 100644 --- a/src/bin/compile-questions.cc +++ b/src/bin/compile-questions.cc @@ -67,14 +67,23 @@ int main(int argc, char *argv[]) { " compile-questions questions.txt questions.qst\n"; bool binary = true; int32 P = 1, N = 3; - int32 num_iters_refine = 0; + int32 num_iters_refine = 0, + leftmost_questions_truncate = -1; ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("context-width", &N, "Context window size [must match acc-tree-stats]."); - po.Register("central-position", &P, "Central position in phone context window [must match acc-tree-stats]"); - po.Register("num-iters-refine", &num_iters_refine, "Number of iters of refining questions at each node. >0 --> questions not shared"); + po.Register("binary", &binary, + "Write output in binary mode"); + po.Register("context-width", &N, + "Context window size [must match acc-tree-stats]."); + po.Register("central-position", &P, + "Central position in phone context window [must match acc-tree-stats]"); + po.Register("num-iters-refine", &num_iters_refine, + "Number of iters of refining questions at each node. >0 --> questions " + "not refined"); + po.Register("leftmost-questions-truncate", &leftmost_questions_truncate, + "If > 0, the questions for the left-most context position will be " + "truncated to the specified number."); po.Read(argc, argv); @@ -118,9 +127,17 @@ int main(int argc, char *argv[]) { QuestionsForKey phone_opts(num_iters_refine); // the questions-options corresponding to keys 0, 1, .. N-1 which // represent the phonetic context positions (including the central phone). 
- phone_opts.initial_questions = questions; for (int32 n = 0; n < N; n++) { KALDI_LOG << "Setting questions for phonetic-context position "<< n; + if (n == 0 && leftmost_questions_truncate > 0 && + leftmost_questions_truncate < questions.size()) { + KALDI_LOG << "Truncating " << questions.size() << " to " + << leftmost_questions_truncate << " for position 0."; + phone_opts.initial_questions.assign( + questions.begin(), questions.begin() + leftmost_questions_truncate); + } else { + phone_opts.initial_questions = questions; + } qo.SetQuestionsOf(n, phone_opts); } diff --git a/src/bin/compute-wer-bootci.cc b/src/bin/compute-wer-bootci.cc new file mode 100644 index 00000000000..1166cae2421 --- /dev/null +++ b/src/bin/compute-wer-bootci.cc @@ -0,0 +1,254 @@ +// bin/compute-wer-bootci.cc + +// Copyright 2009-2011 Microsoft Corporation +// 2014 Johns Hopkins University (authors: Jan Trmal, Daniel Povey) +// 2015 Brno Universiry of technology (author: Karel Vesely) +// 2016 Nicolas Serrano + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "util/parse-options.h" +#include "tree/context-dep.h" +#include "util/edit-distance.h" +#include "base/kaldi-math.h" + +namespace kaldi { + +void GetEditsSingleHyp( const std::string &hyp_rspecifier, + const std::string &ref_rspecifier, + const std::string &mode, + std::vector > & edit_word_per_hyp) { + + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + int32 num_words = 0, word_errs = 0, num_ins = 0, num_del = 0, num_sub = 0; + + // Main loop, store WER stats per hyp, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent; + if (!hyp_reader.HasKey(key)) { + if (mode == "strict") + KALDI_ERR << "No hypothesis for key " << key << " and strict " + "mode specifier."; + if (mode == "present") // do not score this one. 
+ continue; + } else { + hyp_sent = hyp_reader.Value(key); + } + num_words = ref_sent.size(); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent, + &num_ins, &num_del, &num_sub); + edit_word_per_hyp.push_back(std::pair(word_errs, num_words)); + } +} + +void GetEditsDualHyp(const std::string &hyp_rspecifier, + const std::string &hyp_rspecifier2, + const std::string &ref_rspecifier, + const std::string &mode, + std::vector > & edit_word_per_hyp, + std::vector > & edit_word_per_hyp2) { + + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + RandomAccessTokenVectorReader hyp_reader2(hyp_rspecifier2); + int32 num_words = 0, word_errs = 0, + num_ins = 0, num_del = 0, num_sub = 0; + + // Main loop, store WER stats per hyp, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent, hyp_sent2; + if (mode == "strict" && + (!hyp_reader.HasKey(key) || !hyp_reader2.HasKey(key))) { + KALDI_ERR << "No hypothesis for key " << key << " in both transcripts " + "comparison is not possible."; + } else if (mode == "present" && + (!hyp_reader.HasKey(key) || !hyp_reader2.HasKey(key))) + continue; + + num_words = ref_sent.size(); + + //all mode, if a hypothesis is not present, consider as an error + if(hyp_reader.HasKey(key)){ + hyp_sent = hyp_reader.Value(key); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent, + &num_ins, &num_del, &num_sub); + } + else + word_errs = num_words; + edit_word_per_hyp.push_back(std::pair(word_errs, num_words)); + + if(hyp_reader2.HasKey(key)){ + hyp_sent2 = hyp_reader2.Value(key); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent2, + &num_ins, &num_del, &num_sub); + } + else + word_errs = num_words; + edit_word_per_hyp2.push_back(std::pair(word_errs, num_words)); + } +} + +void GetBootstrapWERInterval( + const std::vector > & edit_word_per_hyp, + int32 replications, + BaseFloat *mean, BaseFloat *interval) { + BaseFloat wer_accum = 0.0, wer_mult_accum = 0.0; + + for (int32 i = 0; i <= replications; ++i) { + int32 num_words = 0, word_errs = 0; + for (int32 j = 0; j <= edit_word_per_hyp.size(); ++j) { + int32 random_pos = kaldi::RandInt(0, edit_word_per_hyp.size()); + word_errs += edit_word_per_hyp[random_pos].first; + num_words += edit_word_per_hyp[random_pos].second; + } + + BaseFloat wer_rep = static_cast(word_errs) / num_words; + wer_accum += wer_rep; + wer_mult_accum += wer_rep*wer_rep; + } + + // Compute mean WER and std WER + *mean = wer_accum / replications; + *interval = 1.96*sqrt(wer_mult_accum/replications-(*mean)*(*mean)); +} + +void GetBootstrapWERTwoSystemComparison( + const std::vector > & edit_word_per_hyp, + const std::vector > & edit_word_per_hyp2, + int32 replications, BaseFloat *p_improv) { + int32 improv_accum = 0.0; + + for (int32 i = 0; i <= replications; ++i) { + int32 word_errs = 0; + for (int32 j = 0; j <= edit_word_per_hyp.size(); ++j) { + int32 random_pos = kaldi::RandInt(0, edit_word_per_hyp.size()); + word_errs += edit_word_per_hyp[random_pos].first - + edit_word_per_hyp2[random_pos].first; + } + if(word_errs > 0) + ++improv_accum; + } + // Compute mean WER and std WER + *p_improv = static_cast(improv_accum) / replications; +} + +} //namespace kaldi + +int main(int argc, char *argv[]) { + using namespace kaldi; + typedef kaldi::int32 int32; + + try { + const char *usage = + "Compute a bootstrapping of WER to 
extract the 95\% confidence interval.\n" + "Take a reference and a transcription file, in integer or text format,\n" + "and outputs overall WER statistics to standard output along with its\n" + "confidence interval using the bootstrap methos of Bisani and Ney.\n" + "If a second transcription file corresponding to the same reference is\n" + "provided, a bootstrap comparison of the two transcription is performed\n" + "to estimate the probability of improvement.\n" + "\n" + "Usage: compute-wer-bootci [options] []\n" + "E.g.: compute-wer-bootci --mode=present ark:data/train/text ark:hyp_text\n" + "or compute-wer-bootci ark:data/train/text ark:hyp_text ark:hyp_text2\n" + "See also: compute-wer\n"; + + ParseOptions po(usage); + + std::string mode = "strict"; + po.Register("mode", &mode, + "Scoring mode: \"present\"|\"all\"|\"strict\":\n" + " \"present\" means score those we have transcriptions for\n" + " \"all\" means treat absent transcriptions as empty\n" + " \"strict\" means die if all in ref not also in hyp"); + + int32 replications = 10000; + po.Register("replications", &replications, + "Number of replications to compute the intervals"); + + po.Read(argc, argv); + + if (po.NumArgs() < 2 || po.NumArgs() > 3) { + po.PrintUsage(); + exit(1); + } + + std::string ref_rspecifier = po.GetArg(1); + std::string hyp_rspecifier = po.GetArg(2); + std::string hyp2_rspecifier = (po.NumArgs() == 3?po.GetArg(3):""); + + if (mode != "strict" && mode != "present" && mode != "all") { + KALDI_ERR << + "--mode option invalid: expected \"present\"|\"all\"|\"strict\", got " + << mode; + } + + //Get editions per each utterance + std::vector > edit_word_per_hyp, edit_word_per_hyp2; + if(hyp2_rspecifier.empty()) + GetEditsSingleHyp(hyp_rspecifier, ref_rspecifier, mode, edit_word_per_hyp); + else + GetEditsDualHyp(hyp_rspecifier, hyp2_rspecifier, ref_rspecifier, mode, + edit_word_per_hyp, edit_word_per_hyp2); + + //Extract WER for a number of replications of the same size + //as the hypothesis extracted + BaseFloat mean_wer = 0.0, interval = 0.0, + mean_wer2 = 0.0, interval2 = 0.0, + p_improv = 0.0; + + GetBootstrapWERInterval(edit_word_per_hyp, replications, + &mean_wer, &interval); + + if(!hyp2_rspecifier.empty()) { + GetBootstrapWERInterval(edit_word_per_hyp2, replications, + &mean_wer2, &interval2); + + GetBootstrapWERTwoSystemComparison(edit_word_per_hyp, edit_word_per_hyp2, + replications, &p_improv); + } + + // Print the output, + std::cout.precision(2); + std::cerr.precision(2); + std::cout << "Set1: %WER " << std::fixed << 100*mean_wer << + " 95\% Conf Interval [ " << 100*mean_wer-100*interval << + ", " << 100*mean_wer+100*interval << " ]" << '\n'; + + if(!hyp2_rspecifier.empty()) { + std::cout << "Set2: %WER " << std::fixed << 100*mean_wer2 << + " 95\% Conf Interval [ " << 100*mean_wer2-100*interval2 << + ", " << 100*mean_wer2+100*interval2 << " ]" << '\n'; + + std::cout << "Probability of Set2 improving Set1: " << std::fixed << + 100*p_improv << '\n'; + } + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/bin/compute-wer.cc b/src/bin/compute-wer.cc index 97e025d2c22..3d5b42c7f1d 100644 --- a/src/bin/compute-wer.cc +++ b/src/bin/compute-wer.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation // 2014 Johns Hopkins University (authors: Jan Trmal, Daniel Povey) +// 2015 Brno Universiry of technology (author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // @@ -24,42 +25,6 @@ #include "tree/context-dep.h" 
#include "util/edit-distance.h" - -namespace kaldi { - - -template -void PrintAlignmentStats(const std::vector &ref, - const std::vector &hyp, - T eps, - std::ostream &os) { - // Make sure the eps symbol is not in the sentences we're aligning; this would - // not make sense. - KALDI_ASSERT(std::find(ref.begin(), ref.end(), eps) == ref.end()); - KALDI_ASSERT(std::find(hyp.begin(), hyp.end(), eps) == hyp.end()); - - std::vector > aligned; - typedef typename std::vector >::const_iterator aligned_iterator; - - LevenshteinAlignment(ref, hyp, eps, &aligned); - for (aligned_iterator it = aligned.begin(); - it != aligned.end(); ++it) { - KALDI_ASSERT(!(it->first == eps && it->second == eps)); - if (it->first == eps) { - os << "insertion " << it->second << std::endl; - } else if (it->second == eps) { - os << "deletion " << it->first << std::endl; - } else if (it->first != it->second) { - os << "substitution " << it->first << ' ' << it->second << std::endl; - } else { - os << "correct " << it->first << std::endl; - } - } -} - -} - - int main(int argc, char *argv[]) { using namespace kaldi; typedef kaldi::int32 int32; @@ -69,29 +34,27 @@ int main(int argc, char *argv[]) { "Compute WER by comparing different transcriptions\n" "Takes two transcription files, in integer or text format,\n" "and outputs overall WER statistics to standard output.\n" - "Optionally, the third argument can be used to obtain detailed statistics\n" - "\n" - "Usage: compute-wer [options] []\n" "\n" + "Usage: compute-wer [options] \n" "E.g.: compute-wer --text --mode=present ark:data/train/text ark:hyp_text\n" - "or: compute-wer --text --mode=present ark:data/train/text ark:hyp_text - | \\\n" - " sort | uniq -c\n"; + "See also: align-text,\n" + "Example scoring script: egs/wsj/s5/steps/score_kaldi.sh\n"; ParseOptions po(usage); std::string mode = "strict"; - bool text_input = false; // if this is true, we expect symbols as strings, - po.Register("mode", &mode, "Scoring mode: \"present\"|\"all\"|\"strict\":\n" " \"present\" means score those we have transcriptions for\n" " \"all\" means treat absent transcriptions as empty\n" " \"strict\" means die if all in ref not also in hyp"); - po.Register("text", &text_input, "Expect strings, not integers, as input."); + + bool dummy = false; + po.Register("text", &dummy, "Deprecated option! 
Keeping for compatibility reasons."); po.Read(argc, argv); - if (po.NumArgs() < 2 || po.NumArgs() > 3) { + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } @@ -99,103 +62,62 @@ int main(int argc, char *argv[]) { std::string ref_rspecifier = po.GetArg(1); std::string hyp_rspecifier = po.GetArg(2); - Output stats_output; - bool detailed_stats = (po.NumArgs() == 3); - if (detailed_stats) - stats_output.Open(po.GetOptArg(3), false, false); // non-binary output - if (mode != "strict" && mode != "present" && mode != "all") { KALDI_ERR << "--mode option invalid: expected \"present\"|\"all\"|\"strict\", got " << mode; } - - int32 num_words = 0, word_errs = 0, num_sent = 0, sent_errs = 0, num_ins = 0, num_del = 0, num_sub = 0, num_absent_sents = 0; - if (!text_input) { - SequentialInt32VectorReader ref_reader(ref_rspecifier); - RandomAccessInt32VectorReader hyp_reader(hyp_rspecifier); - - for (; !ref_reader.Done(); ref_reader.Next()) { - std::string key = ref_reader.Key(); - const std::vector &ref_sent = ref_reader.Value(); - std::vector hyp_sent; - if (!hyp_reader.HasKey(key)) { - if (mode == "strict") - KALDI_ERR << "No hypothesis for key " << key << " and strict " - "mode specifier."; - num_absent_sents++; - if (mode == "present") // do not score this one. - continue; - } else { - hyp_sent = hyp_reader.Value(key); - } - num_words += ref_sent.size(); - int32 ins, del, sub; - word_errs += LevenshteinEditDistance(ref_sent, hyp_sent, - &ins, &del, &sub); - num_ins += ins; - num_del += del; - num_sub += sub; - - if (detailed_stats) { - const int32 eps = -1; - PrintAlignmentStats(ref_sent, hyp_sent, eps, stats_output.Stream()); - } - num_sent++; - sent_errs += (ref_sent != hyp_sent); - } - } else { - SequentialTokenVectorReader ref_reader(ref_rspecifier); - RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); - - for (; !ref_reader.Done(); ref_reader.Next()) { - std::string key = ref_reader.Key(); - const std::vector &ref_sent = ref_reader.Value(); - std::vector hyp_sent; - if (!hyp_reader.HasKey(key)) { - if (mode == "strict") - KALDI_ERR << "No hypothesis for key " << key << " and strict " - "mode specifier."; - num_absent_sents++; - if (mode == "present") // do not score this one. - continue; - } else { - hyp_sent = hyp_reader.Value(key); - } - num_words += ref_sent.size(); - int32 ins, del, sub; - word_errs += LevenshteinEditDistance(ref_sent, hyp_sent, - &ins, &del, &sub); - num_ins += ins; - num_del += del; - num_sub += sub; - - if (detailed_stats) { - const std::string eps = ""; - PrintAlignmentStats(ref_sent, hyp_sent, eps, stats_output.Stream()); - } - num_sent++; - sent_errs += (ref_sent != hyp_sent); + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + + // Main loop, accumulate WER stats, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent; + if (!hyp_reader.HasKey(key)) { + if (mode == "strict") + KALDI_ERR << "No hypothesis for key " << key << " and strict " + "mode specifier."; + num_absent_sents++; + if (mode == "present") // do not score this one. 
+ continue; + } else { + hyp_sent = hyp_reader.Value(key); } + num_words += ref_sent.size(); + int32 ins, del, sub; + word_errs += LevenshteinEditDistance(ref_sent, hyp_sent, &ins, &del, &sub); + num_ins += ins; + num_del += del; + num_sub += sub; + + num_sent++; + sent_errs += (ref_sent != hyp_sent); } + // Compute WER, SER, BaseFloat percent_wer = 100.0 * static_cast(word_errs) / static_cast(num_words); + BaseFloat percent_ser = 100.0 * static_cast(sent_errs) + / static_cast(num_sent); + + // Print the ouptut, std::cout.precision(2); std::cerr.precision(2); std::cout << "%WER " << std::fixed << percent_wer << " [ " << word_errs << " / " << num_words << ", " << num_ins << " ins, " << num_del << " del, " << num_sub << " sub ]" << (num_absent_sents != 0 ? " [PARTIAL]" : "") << '\n'; - BaseFloat percent_ser = 100.0 * static_cast(sent_errs) - / static_cast(num_sent); std::cout << "%SER " << std::fixed << percent_ser << " [ " << sent_errs << " / " << num_sent << " ]\n"; std::cout << "Scored " << num_sent << " sentences, " << num_absent_sents << " not present in hyp.\n"; + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/bin/convert-ali.cc b/src/bin/convert-ali.cc index 97a503b26ab..3a52b7904a0 100644 --- a/src/bin/convert-ali.cc +++ b/src/bin/convert-ali.cc @@ -32,16 +32,24 @@ int main(int argc, char *argv[]) { try { const char *usage = "Convert alignments from one decision-tree/model to another\n" - "Usage: convert-ali [options] old-model new-model new-tree old-alignments-rspecifier new-alignments-wspecifier\n" + "Usage: convert-ali [options] " + " \n" "e.g.: \n" - " convert-ali old.mdl new.mdl new.tree ark:old.ali ark:new.ali\n"; + " convert-ali old/final.mdl new/0.mdl new/tree ark:old/ali.1 ark:new/ali.1\n"; + int32 frame_subsampling_factor = 1; + bool reorder = true; std::string phone_map_rxfilename; ParseOptions po(usage); po.Register("phone-map", &phone_map_rxfilename, "File name containing old->new phone mapping (each line is: " "old-integer-id new-integer-id)"); + po.Register("reorder", &reorder, + "True if you want the converted alignments to be 'reordered' " + "versus the way they appear in the HmmTopology object"); + po.Register("frame-subsampling-factor", &frame_subsampling_factor, + "Can be used in converting alignments to reduced frame rates."); po.Read(argc, argv); @@ -61,7 +69,7 @@ int main(int argc, char *argv[]) { ReadPhoneMap(phone_map_rxfilename, &phone_map); } - + SequentialInt32VectorReader alignment_reader(old_alignments_rspecifier); Int32VectorWriter alignment_writer(new_alignments_wspecifier); @@ -74,8 +82,8 @@ int main(int argc, char *argv[]) { if (!(old_trans_model.GetTopo() == new_trans_model.GetTopo())) KALDI_WARN << "Toplogies of models are not equal: " << "conversion may not be correct or may fail."; - - + + ContextDependency new_ctx_dep; // the tree. ReadKaldiObject(new_tree_filename, &new_ctx_dep); @@ -86,11 +94,13 @@ int main(int argc, char *argv[]) { const std::vector &old_alignment = alignment_reader.Value(); std::vector new_alignment; if (ConvertAlignment(old_trans_model, - new_trans_model, - new_ctx_dep, - old_alignment, - (phone_map_rxfilename != "" ? &phone_map : NULL), - &new_alignment)) { + new_trans_model, + new_ctx_dep, + old_alignment, + frame_subsampling_factor, + reorder, + (phone_map_rxfilename != "" ? 
&phone_map : NULL), + &new_alignment)) { alignment_writer.Write(key, new_alignment); num_success++; } else { @@ -101,7 +111,7 @@ int main(int argc, char *argv[]) { } KALDI_LOG << "Succeeded converting alignments for " << num_success - <<" files, failed for " << num_fail; + << " files, failed for " << num_fail; if (num_success != 0) return 0; else return 1; @@ -110,5 +120,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/bin/copy-transition-model.cc b/src/bin/copy-transition-model.cc index 2debe64a674..62a5d0c51dd 100644 --- a/src/bin/copy-transition-model.cc +++ b/src/bin/copy-transition-model.cc @@ -35,10 +35,10 @@ int main(int argc, char *argv[]) { " models from the acoustic models they are written with.\n" "Usage: copy-transition-model [options] \n" "e.g.: \n" - " copy-transition-model --binarhy=false 1.mdl 1.txt\n"; + " copy-transition-model --binary=false 1.mdl 1.txt\n"; bool binary; - + ParseOptions po(usage); po.Register("binary", &binary, "Write output in binary mode."); diff --git a/src/bin/decode-faster-mapped.cc b/src/bin/decode-faster-mapped.cc index 90c7125f927..c7411592504 100644 --- a/src/bin/decode-faster-mapped.cc +++ b/src/bin/decode-faster-mapped.cc @@ -160,7 +160,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count << " frames."; - if (word_syms) delete word_syms; + delete word_syms; delete decode_fst; if (num_success != 0) return 0; else return 1; diff --git a/src/bin/decode-faster.cc b/src/bin/decode-faster.cc index 6e5851e12f7..cbcdb771d56 100644 --- a/src/bin/decode-faster.cc +++ b/src/bin/decode-faster.cc @@ -156,7 +156,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count << " frames."; - if (word_syms) delete word_syms; + delete word_syms; delete decode_fst; if (num_success != 0) return 0; else return 1; diff --git a/src/bin/draw-tree.cc b/src/bin/draw-tree.cc index ed869ff250b..a534fdf78de 100644 --- a/src/bin/draw-tree.cc +++ b/src/bin/draw-tree.cc @@ -111,8 +111,8 @@ int main(int argc, char **argv) { renderer->Render(query); } - if (renderer) delete renderer; - if (query) delete query; + delete renderer; + delete query; } catch (const std::exception &e) { std::cerr << e.what(); return -1; diff --git a/src/bin/extract-ctx.cc b/src/bin/extract-ctx.cc index d3c36119581..b361a1d8707 100644 --- a/src/bin/extract-ctx.cc +++ b/src/bin/extract-ctx.cc @@ -30,7 +30,6 @@ #include "fst/fstlib.h" using namespace kaldi; - using std::vector; // Generate a string representation of the given EventType; the symtable is @@ -41,7 +40,7 @@ static std::string EventTypeToString(EventType &e, bool addpos) { // make sure it's sorted so that the kPdfClass is the first element! std::sort(e.begin(), e.end()); - + // first plot the pdf-class std::stringstream ss; ss << e[0].second; @@ -49,7 +48,7 @@ static std::string EventTypeToString(EventType &e, ss << " "; if (addpos) ss << (i-1) << ":"; - + if (phones_symtab == NULL) ss << e[i].second; else { @@ -69,6 +68,7 @@ static std::string EventTypeToString(EventType &e, int main(int argc, char *argv[]) { try { typedef kaldi::int32 int32; + const char *usage = "Given the tree stats and the resulting tree, output a mapping of phones\n" "in context (and pdf-class) to the pdf-id. 
This can be used to link the\n" @@ -77,16 +77,16 @@ int main(int argc, char *argv[]) { "e.g.: \n" " extract-ctx treeacc tree\n" " extract-ctx --mono 48 tree\n"; - + ParseOptions po(usage); - + std::string fsymboltab; bool addpos = false; bool mono = false; std::string silphones = "1,2,3"; int32 silpdfclasses = 5; int32 nonsilpdfclasses = 3; - + po.Register("mono", &mono, "Assume mono-phone tree; instead of tree stats, specify highest id"); po.Register("sil-phones", &silphones, @@ -100,12 +100,12 @@ int main(int argc, char *argv[]) { po.Register("add-position-indicators", &addpos, "Add position indicators for phonemes"); po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } - + // read symtab if available fst::SymbolTable *phones_symtab = NULL; if (fsymboltab.length() > 0) { @@ -115,17 +115,17 @@ int main(int argc, char *argv[]) { if (!phones_symtab) KALDI_ERR << "Could not read phones symbol table file "<< fsymboltab; } - + // read the tree, get all the leaves ContextDependency ctx_dep; ReadKaldiObject(po.GetArg(2), &ctx_dep); const EventMap &map = ctx_dep.ToPdfMap(); - + // here we have to do different things for mono and tri+ trees if (mono) { // A mono-phone tree is not actually a real tree. We test for EventTypes // that have the central phone and the possible pdf-classes - + int32 maxs = atoi(po.GetArg(1).c_str()); if (phones_symtab != NULL) { size_t ns = phones_symtab->NumSymbols(); @@ -135,10 +135,10 @@ int main(int argc, char *argv[]) { maxs = (ns-1); } } - + // parse silphones std::set silset; - + std::string::size_type i1 = 0, i2; do { i2 = silphones.find(',', i1); @@ -148,19 +148,19 @@ int main(int argc, char *argv[]) { break; i1 = i2 + 1; } while (true); - - + + // now query each phone (ignore which is 0) for (int32 p = 1; p <= maxs; ++p) { int32 mpdf = (silset.find(p) == silset.end() ? 
nonsilpdfclasses : silpdfclasses); - + for (int i = 0; i < mpdf; ++i) { EventType et; et.push_back(std::pair(kPdfClass, i)); et.push_back(std::pair(0, p)); - + EventAnswerType ans; if (map.Map(et, &ans)) { std::cout << ans << " " @@ -173,12 +173,12 @@ int main(int argc, char *argv[]) { } } - + } else { // for tri+ trees, read the tree stats; this gives us basically all // phones-in-context that may be linked to an individual model // (in practice, many of them will be shared, but we plot them anyways) - + // build-tree-questions.h:typedef std::vector > BuildTreeStatsType BuildTreeStatsType stats; { @@ -188,9 +188,9 @@ int main(int argc, char *argv[]) { ReadBuildTreeStats(ki.Stream(), binary_in, gc, &stats); } KALDI_LOG << "Number of separate statistics is " << stats.size(); - + // typedef std::vector > EventType - + // now, for each tree stats element, query the tree to get the pdf-id for (size_t i = 0; i < stats.size(); ++i) { EventAnswerType ans; @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { } } } - + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/bin/latgen-faster-mapped-parallel.cc b/src/bin/latgen-faster-mapped-parallel.cc index 59e7f7170d1..dd4a3269cdf 100644 --- a/src/bin/latgen-faster-mapped-parallel.cc +++ b/src/bin/latgen-faster-mapped-parallel.cc @@ -170,7 +170,7 @@ int main(int argc, char *argv[]) { } sequencer.Wait(); - if (decode_fst != NULL) delete decode_fst; + delete decode_fst; double elapsed = timer.Elapsed(); KALDI_LOG << "Decoded with " << sequencer_config.num_threads << " threads."; @@ -182,7 +182,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count<<" frames."; - if (word_syms) delete word_syms; + delete word_syms; if (num_success != 0) return 0; else return 1; } catch(const std::exception &e) { diff --git a/src/bin/latgen-faster-mapped.cc b/src/bin/latgen-faster-mapped.cc index 1ca62ca200c..8043bd31116 100644 --- a/src/bin/latgen-faster-mapped.cc +++ b/src/bin/latgen-faster-mapped.cc @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count<<" frames."; - if (word_syms) delete word_syms; + delete word_syms; if (num_success != 0) return 0; else return 1; } catch(const std::exception &e) { diff --git a/src/bin/latgen-tracking-mapped.cc b/src/bin/latgen-tracking-mapped.cc index 46d2fca5f71..cf89cb17b94 100644 --- a/src/bin/latgen-tracking-mapped.cc +++ b/src/bin/latgen-tracking-mapped.cc @@ -202,7 +202,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count<<" frames."; - if (word_syms) delete word_syms; + delete word_syms; if (num_success != 0) return 0; else return 1; } catch(const std::exception &e) { diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc index 65078e26ff9..6c1d2ad9f12 100644 --- a/src/bin/matrix-sum.cc +++ b/src/bin/matrix-sum.cc @@ -110,7 +110,8 @@ int32 TypeTwoUsage(const ParseOptions &po, "matrix-sum: first argument must be an rspecifier"); // if next assert fails it would be bug in the code as otherwise we shouldn't // be called. 
- KALDI_ASSERT(ClassifyRspecifier(po.GetArg(2), NULL, NULL) == kNoRspecifier); + KALDI_ASSERT(ClassifyWspecifier(po.GetArg(2), NULL, NULL, NULL) == + kNoWspecifier); SequentialBaseFloatMatrixReader mat_reader(po.GetArg(1)); @@ -152,12 +153,17 @@ int32 TypeTwoUsage(const ParseOptions &po, int32 TypeThreeUsage(const ParseOptions &po, bool binary) { KALDI_ASSERT(po.NumArgs() >= 2); - for (int32 i = 1; i <= po.NumArgs(); i++) { - if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + for (int32 i = 1; i < po.NumArgs(); i++) { + if (ClassifyRspecifier(po.GetArg(i), NULL, NULL) != kNoRspecifier) { KALDI_ERR << "Wrong usage (type 3): if first and last arguments are not " << "tables, the intermediate arguments must not be tables."; } } + if (ClassifyWspecifier(po.GetArg(po.NumArgs()), NULL, NULL, NULL) != + kNoWspecifier) { + KALDI_ERR << "Wrong usage (type 3): if first and last arguments are not " + << "tables, the intermediate arguments must not be tables."; + } bool add = true; Matrix mat; @@ -218,19 +224,19 @@ int main(int argc, char *argv[]) { int32 N = po.NumArgs(), exit_status; if (po.NumArgs() >= 2 && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) != kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) != kNoWspecifier) { // output to table. exit_status = TypeOneUsage(po, scale1, scale2); } else if (po.NumArgs() == 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) == - kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == + kNoWspecifier) { KALDI_ASSERT(scale1 == 1.0 && scale2 == 1.0); // input from a single table, output not to table. exit_status = TypeTwoUsage(po, binary); } else if (po.NumArgs() >= 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) == kNoRspecifier && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) == kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == kNoWspecifier) { KALDI_ASSERT(scale1 == 1.0 && scale2 == 1.0); // summing flat files. exit_status = TypeThreeUsage(po, binary); diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 2d3caddf78d..f9b9291a90b 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -27,7 +27,7 @@ #include "fstext/fstext-lib.h" // Create FST that accepts the phone sequence, with any number -// of word-start and word-end symbol in between each phone. +// of word-start and word-end symbol in between each phone. 
void CreatePhonesAltFst(const std::vector &phones, int32 word_start_sym, int32 word_end_sym, @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { "e.g.: \n" " ali-to-phones 1.mdl ark:1.ali ark:- | \\\n" " phones-to-prons L_align.fst 46 47 ark:- 1.tra ark:1.prons\n"; - + ParseOptions po(usage); po.Read(argc, argv); @@ -99,7 +99,7 @@ int main(int argc, char *argv[]) { prons_wspecifier = po.GetArg(6); int32 word_start_sym, word_end_sym; - + if (!ConvertStringToInteger(word_start_sym_str, &word_start_sym) || word_start_sym <= 0) KALDI_ERR << "Invalid word start symbol (expecting integer >= 0): " @@ -117,15 +117,15 @@ int main(int argc, char *argv[]) { fst::OLabelCompare olabel_comp; ArcSort(L, olabel_comp); } - + SequentialInt32VectorReader phones_reader(phones_rspecifier); RandomAccessInt32VectorReader words_reader(words_rspecifier); - + int32 n_done = 0, n_err = 0; - + std::string empty; Int32VectorVectorWriter prons_writer(prons_wspecifier); - + for (; !phones_reader.Done(); phones_reader.Next()) { std::string key = phones_reader.Key(); const std::vector &phones = phones_reader.Value(); @@ -163,7 +163,7 @@ int main(int argc, char *argv[]) { // on the input side, and words on the output side. VectorFst phnx2word; Compose(phones_alt_fst, phn2word, &phnx2word); - + if (phnx2word.Start() == fst::kNoStateId) { KALDI_WARN << "phnx2word FST for utterance " << key << "is empty (either decoding for this utterance did " @@ -196,7 +196,7 @@ int main(int argc, char *argv[]) { KALDI_ERR << "phnx2word is not a linear transducer (code error?)"; if (words2 != words) KALDI_ERR << "words have changed! (code error?)"; - + // Now, "phnx" should be the phone sequence with start and end // symbols included. At this point we break it up into segments, // and try to match it up with words. 
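// (Hedged sketch, not part of the patch: conceptually, the segmentation step
// described above walks "phnx" and collects the phones between each
// word_start_sym / word_end_sym pair into one pronunciation; the helper name
// below is hypothetical.)
//
//   std::vector<std::vector<int32> > SplitPhnxIntoProns(
//       const std::vector<int32> &phnx,
//       int32 word_start_sym, int32 word_end_sym) {
//     std::vector<std::vector<int32> > prons;
//     std::vector<int32> cur;
//     bool in_word = false;
//     for (size_t i = 0; i < phnx.size(); i++) {
//       if (phnx[i] == word_start_sym) { in_word = true; cur.clear(); }
//       else if (phnx[i] == word_end_sym) { in_word = false; prons.push_back(cur); }
//       else if (in_word) cur.push_back(phnx[i]);   // an ordinary phone.
//     }
//     return prons;
//   }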
@@ -211,7 +211,7 @@ int main(int argc, char *argv[]) { continue; } prons_writer.Write(key, prons); - n_done++; + n_done++; } KALDI_LOG << "Done " << n_done << " utterances; " << n_err << " had errors."; } catch(const std::exception &e) { diff --git a/src/bin/sum-lda-accs.cc b/src/bin/sum-lda-accs.cc index 4988dfb57ca..22f11cc45ce 100644 --- a/src/bin/sum-lda-accs.cc +++ b/src/bin/sum-lda-accs.cc @@ -1,4 +1,4 @@ -// bin/sum-lda.cc +// bin/sum-lda-accs.cc // Copyright 2014 LINSE/UFSC; Augusto Henrique Hentz diff --git a/src/bin/sum-mllt-accs.cc b/src/bin/sum-mllt-accs.cc index 2bb43f0d112..4d580e21e55 100644 --- a/src/bin/sum-mllt-accs.cc +++ b/src/bin/sum-mllt-accs.cc @@ -1,4 +1,4 @@ -// bin/sum-mllt.cc +// bin/sum-mllt-accs.cc // Copyright 2014 LINSE/UFSC; Augusto Henrique Hentz diff --git a/src/bin/sum-tree-stats.cc b/src/bin/sum-tree-stats.cc index a4802a96847..fdaa3d178ca 100644 --- a/src/bin/sum-tree-stats.cc +++ b/src/bin/sum-tree-stats.cc @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { for (std::map::const_iterator iter = tree_stats.begin(); iter != tree_stats.end(); - iter++ ) { + ++iter) { stats.push_back(std::make_pair(iter->first, iter->second)); } tree_stats.clear(); diff --git a/src/bin/transform-vec.cc b/src/bin/transform-vec.cc index d79c8afdef8..4fd390a9ce7 100644 --- a/src/bin/transform-vec.cc +++ b/src/bin/transform-vec.cc @@ -1,4 +1,4 @@ -// featbin/transform-vec.cc +// bin/transform-vec.cc // Copyright 2009-2012 Microsoft Corporation // 2012-2014 Johns Hopkins University (author: Daniel Povey) diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 70f5cec5b7b..42404e38384 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -101,13 +101,15 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 TypeTwoUsage(const ParseOptions &po, - bool binary) { + bool binary, + bool average = false) { KALDI_ASSERT(po.NumArgs() == 2); KALDI_ASSERT(ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && "vector-sum: first argument must be an rspecifier"); // if next assert fails it would be bug in the code as otherwise we shouldn't // be called. 
- KALDI_ASSERT(ClassifyRspecifier(po.GetArg(2), NULL, NULL) == kNoRspecifier); + KALDI_ASSERT(ClassifyWspecifier(po.GetArg(2), NULL, NULL, NULL) == + kNoWspecifier); SequentialBaseFloatVectorReader vec_reader(po.GetArg(1)); @@ -132,6 +134,8 @@ int32 TypeTwoUsage(const ParseOptions &po, } } } + + if (num_done > 0 && average) sum.Scale(1.0 / num_done); Vector sum_float(sum); WriteKaldiObject(sum_float, po.GetArg(2), binary); @@ -147,12 +151,17 @@ int32 TypeTwoUsage(const ParseOptions &po, int32 TypeThreeUsage(const ParseOptions &po, bool binary) { KALDI_ASSERT(po.NumArgs() >= 2); - for (int32 i = 1; i <= po.NumArgs(); i++) { - if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + for (int32 i = 1; i < po.NumArgs(); i++) { + if (ClassifyRspecifier(po.GetArg(i), NULL, NULL) != kNoRspecifier) { KALDI_ERR << "Wrong usage (type 3): if first and last arguments are not " << "tables, the intermediate arguments must not be tables."; } } + if (ClassifyWspecifier(po.GetArg(po.NumArgs()), NULL, NULL, NULL) != + kNoWspecifier) { + KALDI_ERR << "Wrong usage (type 3): if first and last arguments are not " + << "tables, the intermediate arguments must not be tables."; + } bool add = true; Vector vec; @@ -193,30 +202,32 @@ int main(int argc, char *argv[]) { " e.g.: vector-sum --binary=false 1.vec 2.vec 3.vec sum.vec\n" "See also: copy-vector, dot-weights\n"; - bool binary; + bool binary, average = false; ParseOptions po(usage); po.Register("binary", &binary, "If true, write output as binary (only " "relevant for usage types two or three"); + po.Register("average", &average, "Do average instead of sum"); po.Read(argc, argv); int32 N = po.NumArgs(), exit_status; if (po.NumArgs() >= 2 && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) != kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) != kNoWspecifier) { // output to table. exit_status = TypeOneUsage(po); } else if (po.NumArgs() == 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) == - kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == + kNoWspecifier) { // input from a single table, output not to table. - exit_status = TypeTwoUsage(po, binary); + exit_status = TypeTwoUsage(po, binary, average); } else if (po.NumArgs() >= 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) == kNoRspecifier && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) == kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == + kNoWspecifier) { // summing flat files. 
exit_status = TypeThreeUsage(po, binary); } else { diff --git a/src/chain/Makefile b/src/chain/Makefile new file mode 100644 index 00000000000..d8fef6f6055 --- /dev/null +++ b/src/chain/Makefile @@ -0,0 +1,35 @@ + +all: + +include ../kaldi.mk +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +TESTFILES = chain-supervision-test language-model-test + +OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ + language-model.o chain-denominator.o chain-training.o +ifeq ($(CUDA), true) + OBJFILES += chain-kernels.o +endif + +LIBNAME = kaldi-chain + +ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ + ../fstext/kaldi-fstext.a \ + ../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \ + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a + +# Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') + endif +endif + +# Implicit rule for kernel compilation, +%.o : %.cu + $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ + +include ../makefiles/default_rules.mk + diff --git a/src/chain/chain-datastruct.h b/src/chain/chain-datastruct.h new file mode 100644 index 00000000000..52e388a3f2e --- /dev/null +++ b/src/chain/chain-datastruct.h @@ -0,0 +1,55 @@ +// chain/chain-datastruct.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CHAIN_CHAIN_DATASTRUCT_H_ +#define KALDI_CHAIN_CHAIN_DATASTRUCT_H_ +#include "cudamatrix/cu-matrixdim.h" // for CU1DBLOCK and CU2DBLOCK, and int32_cuda + +/** + This header is for declaring "C" structures that are to be used in the + CUDA interface for things in this directory. We put it in a separate header from + the CUDA stuff as it may be needed regardless of whether we're actually compiling with + CUDA. + */ + +extern "C" { + // "C" version of the BaseFloat typedef-- this saves us having to write + // multiple versions of these kernels. +#if (KALDI_DOUBLEPRECISION != 0) + typedef double BaseFloat; +#else + typedef float BaseFloat; +#endif + + struct DenominatorGraphTransition { + BaseFloat transition_prob; // language-model part of the probability (not + // in log) + int32_cuda pdf_id; // pdf-id on the transition. + int32_cuda hmm_state; // source, or destination, HMM state. + }; + + + // Search for this in chain-kernels.cu for an explanation. 
+ enum { kThresholdingPowerOfTwo = 14 }; + +} + + + +#endif diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc new file mode 100644 index 00000000000..eaeac25046d --- /dev/null +++ b/src/chain/chain-den-graph.cc @@ -0,0 +1,389 @@ +// chain/chain-den-graph.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "chain/chain-den-graph.h" +#include "hmm/hmm-utils.h" +#include "fstext/push-special.h" + +namespace kaldi { +namespace chain { + + +DenominatorGraph::DenominatorGraph(const fst::StdVectorFst &fst, + int32 num_pdfs): + num_pdfs_(num_pdfs) { + SetTransitions(fst, num_pdfs); + SetInitialProbs(fst); +} + +const Int32Pair* DenominatorGraph::BackwardTransitions() const { + return backward_transitions_.Data(); +} + +const Int32Pair* DenominatorGraph::ForwardTransitions() const { + return forward_transitions_.Data(); +} + +const DenominatorGraphTransition* DenominatorGraph::Transitions() const { + return transitions_.Data(); +} + +const CuVector& DenominatorGraph::InitialProbs() const { + return initial_probs_; +} + +void DenominatorGraph::SetTransitions(const fst::StdVectorFst &fst, + int32 num_pdfs) { + int32 num_states = fst.NumStates(); + + std::vector > + transitions_out(num_states), + transitions_in(num_states); + for (int32 s = 0; s < num_states; s++) { + for (fst::ArcIterator aiter(fst, s); !aiter.Done(); + aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + DenominatorGraphTransition transition; + transition.transition_prob = exp(-arc.weight.Value()); + transition.pdf_id = arc.ilabel - 1; + transition.hmm_state = arc.nextstate; + KALDI_ASSERT(transition.pdf_id >= 0 && transition.pdf_id < num_pdfs); + transitions_out[s].push_back(transition); + // now the reverse transition. 
+ transition.hmm_state = s; + transitions_in[arc.nextstate].push_back(transition); + } + } + + std::vector forward_transitions(num_states); + std::vector backward_transitions(num_states); + std::vector transitions; + + for (int32 s = 0; s < num_states; s++) { + forward_transitions[s].first = static_cast(transitions.size()); + transitions.insert(transitions.end(), transitions_out[s].begin(), + transitions_out[s].end()); + forward_transitions[s].second = static_cast(transitions.size()); + } + for (int32 s = 0; s < num_states; s++) { + backward_transitions[s].first = static_cast(transitions.size()); + transitions.insert(transitions.end(), transitions_in[s].begin(), + transitions_in[s].end()); + backward_transitions[s].second = static_cast(transitions.size()); + } + + forward_transitions_ = forward_transitions; + backward_transitions_ = backward_transitions; + transitions_ = transitions; +} + +void DenominatorGraph::SetInitialProbs(const fst::StdVectorFst &fst) { + // we set only the start-state to have probability mass, and then 100 + // iterations of HMM propagation, over which we average the probabilities. + // initial probs won't end up making a huge difference as we won't be using + // derivatives from the first few frames, so this isn't 100% critical. + int32 num_iters = 100; + int32 num_states = fst.NumStates(); + + // we normalize each state so that it sums to one (including + // final-probs)... this is needed because the 'chain' code doesn't + // have transition probabilities. + Vector normalizing_factor(num_states); + for (int32 s = 0; s < num_states; s++) { + double tot_prob = exp(-fst.Final(s).Value()); + for (fst::ArcIterator aiter(fst, s); !aiter.Done(); + aiter.Next()) { + tot_prob += exp(-aiter.Value().weight.Value()); + } + KALDI_ASSERT(tot_prob > 0.0 && tot_prob < 100.0); + normalizing_factor(s) = 1.0 / tot_prob; + } + + Vector cur_prob(num_states), next_prob(num_states), + avg_prob(num_states); + cur_prob(fst.Start()) = 1.0; + for (int32 iter = 0; iter < num_iters; iter++) { + avg_prob.AddVec(1.0 / num_iters, cur_prob); + for (int32 s = 0; s < num_states; s++) { + double prob = cur_prob(s) * normalizing_factor(s); + + for (fst::ArcIterator aiter(fst, s); !aiter.Done(); + aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + next_prob(arc.nextstate) += prob * exp(-arc.weight.Value()); + } + } + cur_prob.Swap(&next_prob); + next_prob.SetZero(); + // Renormalize, beause the HMM won't sum to one even after the + // previous normalization (due to final-probs). 
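    // Aside: a minimal dense-matrix sketch of the same averaging idea
    // (illustrative only; 'EstimateInitialProbs' is a hypothetical helper,
    // not part of these sources), assuming 'trans' is row-stochastic:
    //
    //   std::vector<double> EstimateInitialProbs(
    //       const std::vector<std::vector<double> > &trans,  // trans[i][j] = p(i -> j)
    //       int32 start_state, int32 num_iters) {
    //     int32 n = trans.size();
    //     std::vector<double> cur(n, 0.0), next(n, 0.0), avg(n, 0.0);
    //     cur[start_state] = 1.0;
    //     for (int32 iter = 0; iter < num_iters; iter++) {
    //       for (int32 i = 0; i < n; i++) avg[i] += cur[i] / num_iters;
    //       for (int32 i = 0; i < n; i++)
    //         for (int32 j = 0; j < n; j++) next[j] += cur[i] * trans[i][j];
    //       double sum = 0.0;
    //       for (int32 j = 0; j < n; j++) sum += next[j];
    //       for (int32 j = 0; j < n; j++) { cur[j] = next[j] / sum; next[j] = 0.0; }
    //     }
    //     return avg;  // approximate average occupation probabilities.
    //   }
    //
    // Returning to the renormalization noted above: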
+ cur_prob.Scale(1.0 / cur_prob.Sum()); + } + + Vector avg_prob_float(avg_prob); + initial_probs_ = avg_prob_float; +} + +void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst, + fst::StdVectorFst *ofst) { + KALDI_ASSERT(ifst.NumStates() == initial_probs_.Dim()); + if (&ifst != ofst) + *ofst = ifst; + int32 new_initial_state = ofst->AddState(); + Vector initial_probs(initial_probs_); + + for (int32 s = 0; s < initial_probs_.Dim(); s++) { + BaseFloat initial_prob = initial_probs(s); + KALDI_ASSERT(initial_prob > 0.0); + fst::StdArc arc(0, 0, fst::TropicalWeight(-log(initial_prob)), s); + ofst->AddArc(new_initial_state, arc); + ofst->SetFinal(s, fst::TropicalWeight::One()); + } + ofst->SetStart(new_initial_state); + fst::RmEpsilon(ofst); + fst::ArcSort(ofst, fst::ILabelCompare()); +} + + +void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model, + fst::StdVectorFst *fst) { + int32 num_states = fst->NumStates(); + for (int32 s = 0; s < num_states; s++) { + for (fst::MutableArcIterator aiter(fst, s); + !aiter.Done(); aiter.Next()) { + fst::StdArc arc = aiter.Value(); + KALDI_ASSERT(arc.ilabel == arc.olabel); + if (arc.ilabel > 0) { + arc.ilabel = trans_model.TransitionIdToPdf(arc.ilabel) + 1; + arc.olabel = arc.ilabel; + aiter.SetValue(arc); + } + } + } +} + +void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) { + BaseFloat delta = fst::kDelta * 10.0; // use fairly loose delta for + // aggressive minimimization. + fst::ArcMap(fst, fst::QuantizeMapper(delta)); + fst::EncodeMapper encoder(fst::kEncodeLabels | fst::kEncodeWeights, + fst::ENCODE); + fst::Encode(fst, &encoder); + fst::AcceptorMinimize(fst); + fst::Decode(fst, encoder); +} + +// This static function, used in CreateDenominatorFst, sorts an +// fst's states in decreasing order of number of transitions (into + out of) +// the state. 
The aim is to have states that have a lot of transitions +// either into them or out of them, be numbered earlier, so hopefully +// they will be scheduled first and won't delay the computation +static void SortOnTransitionCount(fst::StdVectorFst *fst) { + // negative_num_transitions[i] will contain (before sorting), the pair + // ( -(num-transitions-into(i) + num-transition-out-of(i)), i) + int32 num_states = fst->NumStates(); + std::vector > negative_num_transitions(num_states); + for (int32 i = 0; i < num_states; i++) { + negative_num_transitions[i].first = 0; + negative_num_transitions[i].second = i; + } + for (int32 i = 0; i < num_states; i++) { + for (fst::ArcIterator aiter(*fst, i); !aiter.Done(); + aiter.Next()) { + negative_num_transitions[i].first--; + negative_num_transitions[aiter.Value().nextstate].first--; + } + } + std::sort(negative_num_transitions.begin(), negative_num_transitions.end()); + std::vector order(num_states); + for (int32 i = 0; i < num_states; i++) + order[negative_num_transitions[i].second] = i; + fst::StateSort(fst, order); +} + +void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { + for (int32 i = 1; i <= 3; i++) { + fst::PushSpecial(fst, fst::kDelta * 0.01); + MinimizeAcceptorNoPush(fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after regular " + << "minimization is " << fst->NumStates() << " and " + << NumArcs(*fst) << " (pass " << i << ")"; + fst::StdVectorFst fst_reversed; + fst::Reverse(*fst, &fst_reversed); + fst::PushSpecial(&fst_reversed, fst::kDelta * 0.01); + MinimizeAcceptorNoPush(&fst_reversed); + fst::Reverse(fst_reversed, fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after reversed " + << "minimization is " << fst->NumStates() << " and " + << NumArcs(*fst) << " (pass " << i << ")"; + } + fst::RmEpsilon(fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after " + << "removing any epsilons introduced by reversal is " + << fst->NumStates() << " and " + << NumArcs(*fst); + fst::PushSpecial(fst, fst::kDelta * 0.01); +} + + +static void PrintDenGraphStats(const fst::StdVectorFst &den_graph) { + int32 num_states = den_graph.NumStates(); + int32 degree_cutoff = 3; // track states with <= transitions in/out. + int32 num_states_low_degree_in = 0, + num_states_low_degree_out = 0, + tot_arcs = 0; + std::vector num_in_arcs(num_states, 0); + for (int32 s = 0; s < num_states; s++) { + if (den_graph.NumArcs(s) <= degree_cutoff) { + num_states_low_degree_out++; + } + tot_arcs += den_graph.NumArcs(s); + for (fst::ArcIterator aiter(den_graph, s); + !aiter.Done(); aiter.Next()) { + int32 dest_state = aiter.Value().nextstate; + num_in_arcs[dest_state]++; + } + } + for (int32 s = 0; s < num_states; s++) { + if (num_in_arcs[s] <= degree_cutoff) { + num_states_low_degree_in++; + } + } + KALDI_LOG << "Number of states is " << num_states << " and arcs " + << tot_arcs << "; number of states with in-degree <= " + << degree_cutoff << " is " << num_states_low_degree_in + << " and with out-degree <= " << degree_cutoff + << " is " << num_states_low_degree_out; +} + + +// Check that every pdf is seen, warn if some are not. 
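
// Aside (illustrative only; never called from the real code, and the argument
// names are hypothetical): a minimal sketch of how the helpers in this file
// are typically wired together when preparing a 'chain' denominator model.
static void ExampleBuildDenominatorGraph(const ContextDependency &ctx_dep,
                                         const TransitionModel &trans_model,
                                         const fst::StdVectorFst &phone_lm) {
  fst::StdVectorFst den_fst;
  CreateDenominatorFst(ctx_dep, trans_model, phone_lm, &den_fst);
  DenominatorGraph den_graph(den_fst, trans_model.NumPdfs());
  fst::StdVectorFst normalization_fst;
  den_graph.GetNormalizationFst(den_fst, &normalization_fst);
  KALDI_LOG << "Example denominator graph has " << den_graph.NumStates()
            << " states; its normalization FST has "
            << normalization_fst.NumStates() << " states.";
}

// The function below performs the pdf-coverage check described in the
// one-line comment just above this aside.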
+static void CheckDenominatorFst(int32 num_pdfs, + const fst::StdVectorFst &den_fst) { + std::vector pdf_seen(num_pdfs); + int32 num_states = den_fst.NumStates(); + for (int32 s = 0; s < num_states; s++) { + for (fst::ArcIterator aiter(den_fst, s); + !aiter.Done(); aiter.Next()) { + int32 pdf_id = aiter.Value().ilabel - 1; + KALDI_ASSERT(pdf_id >= 0 && pdf_id < num_pdfs); + pdf_seen[pdf_id] = true; + } + } + for (int32 pdf = 0; pdf < num_pdfs; pdf++) { + if (!pdf_seen[pdf]) { + KALDI_WARN << "Pdf-id " << pdf << " is not seen in denominator graph."; + } + } +} + +void CreateDenominatorFst(const ContextDependency &ctx_dep, + const TransitionModel &trans_model, + const fst::StdVectorFst &phone_lm_in, + fst::StdVectorFst *den_fst) { + using fst::StdVectorFst; + using fst::StdArc; + KALDI_ASSERT(phone_lm_in.NumStates() != 0); + fst::StdVectorFst phone_lm(phone_lm_in); + + KALDI_LOG << "Number of states and arcs in phone-LM FST is " + << phone_lm.NumStates() << " and " << NumArcs(phone_lm); + + int32 subsequential_symbol = trans_model.GetPhones().back() + 1; + if (ctx_dep.CentralPosition() != ctx_dep.ContextWidth() - 1) { + // note: this function only adds the subseq symbol to the input of what was + // previously an acceptor, so we project, i.e. copy the ilabels to the + // olabels + AddSubsequentialLoop(subsequential_symbol, &phone_lm); + fst::Project(&phone_lm, fst::PROJECT_INPUT); + } + std::vector disambig_syms; // empty list of diambiguation symbols. + fst::ContextFst cfst(subsequential_symbol, trans_model.GetPhones(), + disambig_syms, ctx_dep.ContextWidth(), + ctx_dep.CentralPosition()); + StdVectorFst context_dep_lm; + fst::ComposeContextFst(cfst, phone_lm, &context_dep_lm); + // at this point, context_dep_lm will have indexes into 'ilabels' as its + // input symbol (representing context-dependent phones), and phones on its + // output. We don't need the phones, so we'll project. + fst::Project(&context_dep_lm, fst::PROJECT_INPUT); + + KALDI_LOG << "Number of states and arcs in context-dependent LM FST is " + << context_dep_lm.NumStates() << " and " << NumArcs(context_dep_lm); + + std::vector disambig_syms_h; // disambiguation symbols on input side + // of H -- will be empty. + HTransducerConfig h_config; + // the default is 1, but just document that we want this to stay as one. + // we'll use the same value in test time. Consistency is the key here. + h_config.transition_scale = 1.0; + h_config.push_weights = true; + + StdVectorFst *h_fst = GetHTransducer(cfst.ILabelInfo(), + ctx_dep, + trans_model, + h_config, + &disambig_syms_h); + KALDI_ASSERT(disambig_syms_h.empty()); + StdVectorFst transition_id_fst; + TableCompose(*h_fst, context_dep_lm, &transition_id_fst); + delete h_fst; + + BaseFloat self_loop_scale = 1.0; // We have to be careful to use the same + // value in test time. + bool reorder = false; + // add self-loops to the FST with transition-ids as its labels. + AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, + &transition_id_fst); + // at this point transition_id_fst will have transition-ids as its ilabels and + // context-dependent phones (indexes into ILabelInfo()) as its olabels. + // Discard the context-dependent phones by projecting on the input, keeping + // only the transition-ids. 
+ fst::Project(&transition_id_fst, fst::PROJECT_INPUT); + + MapFstToPdfIdsPlusOne(trans_model, &transition_id_fst); + KALDI_LOG << "Number of states and arcs in transition-id FST is " + << transition_id_fst.NumStates() << " and " + << NumArcs(transition_id_fst); + + // RemoveEpsLocal doesn't remove all epsilons, but it keeps the graph small. + fst::RemoveEpsLocal(&transition_id_fst); + // If there are remaining epsilons, remove them. + fst::RmEpsilon(&transition_id_fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after " + << "removing epsilons is " + << transition_id_fst.NumStates() << " and " + << NumArcs(transition_id_fst); + + DenGraphMinimizeWrapper(&transition_id_fst); + + SortOnTransitionCount(&transition_id_fst); + + *den_fst = transition_id_fst; + CheckDenominatorFst(trans_model.NumPdfs(), *den_fst); + PrintDenGraphStats(*den_fst); +} + + +int32 DenominatorGraph::NumStates() const { + return forward_transitions_.Dim(); +} +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h new file mode 100644 index 00000000000..b2510651f39 --- /dev/null +++ b/src/chain/chain-den-graph.h @@ -0,0 +1,168 @@ +// chain/chain-den-graph.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_DEN_GRAPH_H_ +#define KALDI_CHAIN_CHAIN_DEN_GRAPH_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "chain/chain-datastruct.h" +#include "hmm/transition-model.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" +#include "cudamatrix/cu-array.h" + +namespace kaldi { +namespace chain { + + +/** This class is responsible for storing the FST that we use as the + 'anti-model' or 'denominator-model', that models all possible phone + sequences (or most possible phone sequences, depending how we built it).. + It stores the FST in a format where we can access both the transitions out + of each state, and the transitions into each state. + + This class supports both GPU and non-GPU operation, but is optimized for + GPU. + */ +class DenominatorGraph { + public: + + // the number of states in the HMM. + int32 NumStates() const; + + // the number of PDFs (the labels on the transitions are numbered from 0 to + // NumPdfs() - 1). + int32 NumPdfs() const { return num_pdfs_; } + + DenominatorGraph(); + + // Initialize from epsilon-free acceptor FST with pdf-ids plus one as the + // labels. 'num_pdfs' is only needeed for checking. 
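  // (The "plus one" is because label 0 is reserved for epsilon in OpenFst,
  // so pdf-id p appears on the arcs of this FST as label p + 1.)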
+ DenominatorGraph(const fst::StdVectorFst &fst, + int32 num_pdfs); + + // returns the pointer to the forward-transitions array, indexed by hmm-state, + // which will be on the GPU if we're using a GPU. + const Int32Pair *ForwardTransitions() const; + + // returns the pointer to the backward-transitions array, indexed by + // hmm-state, which will be on the GPU if we're using a GPU. + const Int32Pair *BackwardTransitions() const; + + // returns the array to the actual transitions (this is indexed by the ranges + // returned from the ForwardTransitions and BackwardTransitions arrays). The + // memory will be GPU memory if we are using a GPU. + const DenominatorGraphTransition *Transitions() const; + + // returns the initial-probs of the HMM-states... note, these initial-probs + // don't mean initial at the start of the file, because we usually train on + // pieces of a file. They are approximate initial-probs obtained by running + // the HMM for a fixed number of time-steps (e.g. 100) and averaging the + // posteriors over those time-steps. The exact values won't be very critical. + // Note: we renormalize each HMM-state to sum to one before doing this. + const CuVector &InitialProbs() const; + + // This function outputs a modifified version of the FST that was used to + // build this object, that has an initial-state with epsilon transitions to + // each state, with weight determined by initial_probs_; and has each original + // state being final with probability one (note: we remove epsilons). This is + // used in computing the 'penalty_logprob' of the Supervision objects, to + // ensure that the objective function is never positive, which makes it more + // easily interpretable. 'ifst' must be the same FST that was provided to the + // constructor of this object. [note: ifst and ofst may be the same object.] + // This function ensures that 'ofst' is ilabel sorted (which will be useful in + // composition). + void GetNormalizationFst(const fst::StdVectorFst &ifst, + fst::StdVectorFst *ofst); + + // This function is only used in testing code. + void ScaleInitialProbs(BaseFloat s) { initial_probs_.Scale(s); } + + // Use default copy constructor and assignment operator. + private: + // functions called from the constructor + void SetTransitions(const fst::StdVectorFst &fst, int32 num_pfds); + + // work out the initial-probs. Note, there are no final-probs; we treat all + // states as final with probability one [we have a justification for this.. + // assuming it's roughly a well-normalized HMM, this makes sense; note that we + // train on chunks, so the beginning and end of a chunk appear at arbitrary + // points in the sequence. At both beginning and end of the chunk, we limit + // ourselves to only those pdf-ids that were allowed in the numerator + // sequence. + void SetInitialProbs(const fst::StdVectorFst &fst); + + // forward_transitions_ is an array, indexed by hmm-state index, + // of start and end indexes into the transition_ array, which + // give us the set of transitions out of this state. + CuArray forward_transitions_; + // backward_transitions_ is an array, indexed by hmm-state index, + // of start and end indexes into the transition_ array, which + // give us the set of transitions out of this state. + CuArray backward_transitions_; + // This stores the actual transitions. 
+ CuArray transitions_; + + // The initial-probability of all states, used on the first frame of a + // sequence [although we also apply the constraint that on the first frame, + // only pdf-ids that were active on the 1st frame of the numerator, are + // active. Because in general sequences won't start at the start of files, we + // make this a generic probability distribution close to the limiting + // distribution of the HMM. This isn't too critical. + CuVector initial_probs_; + + int32 num_pdfs_; +}; + + +// Function that does acceptor minimization without weight pushing... +// this is useful when constructing the denominator graph. +void MinimizeAcceptorNoPush(fst::StdVectorFst *fst); + +// Utility function used while building the graph. Converts +// transition-ids to pdf-ids plus one. Assumes 'fst' +// is an acceptor, but does not check this (only looks at its +// ilabels). +void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model, + fst::StdVectorFst *fst); + +// Starting from an acceptor on phones that represents some kind of compiled +// language model (with no disambiguation symbols), this funtion creates the +// denominator-graph. Note: there is similar code in chain-supervision.cc, when +// creating the supervision graph. +void CreateDenominatorFst(const ContextDependency &ctx_dep, + const TransitionModel &trans_model, + const fst::StdVectorFst &phone_lm, + fst::StdVectorFst *den_graph); + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_DEN_GRAPH_H_ diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc new file mode 100644 index 00000000000..b0bdc43ae97 --- /dev/null +++ b/src/chain/chain-denominator.cc @@ -0,0 +1,429 @@ +// chain/chain-denominator.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
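
// A brief usage sketch, for orientation (illustrative only; the names 'opts',
// 'den_graph', 'num_sequences', 'nnet_output' and 'deriv_weight' are
// hypothetical stand-ins for what the training code would supply):
//
//   DenominatorComputation denominator(opts, den_graph, num_sequences,
//                                      nnet_output);
//   BaseFloat den_logprob_term = denominator.Forward();
//   CuMatrix<BaseFloat> nnet_output_deriv(nnet_output.NumRows(),
//                                         nnet_output.NumCols());
//   bool ok = denominator.Backward(deriv_weight, &nnet_output_deriv);
//
// where 'deriv_weight' would fold in the supervision weighting and the sign
// with which this term enters the overall objective.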
+ + +#include "chain/chain-denominator.h" +#include "chain/chain-kernels-ansi.h" + +namespace kaldi { +namespace chain { + +DenominatorComputation::DenominatorComputation( + const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + int32 num_sequences, + const CuMatrixBase &nnet_output): + opts_(opts), + den_graph_(den_graph), + num_sequences_(num_sequences), + frames_per_sequence_(nnet_output.NumRows() / num_sequences_), + exp_nnet_output_transposed_(nnet_output, kTrans), + nnet_output_deriv_transposed_( + exp_nnet_output_transposed_.NumRows(), + std::min(exp_nnet_output_transposed_.NumCols(), + static_cast(kMaxDerivTimeSteps) * + num_sequences_)), + alpha_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + tot_prob_(num_sequences_, kUndefined), + tot_log_prob_(num_sequences_, kUndefined), + log_correction_term_(num_sequences_, kUndefined), + ok_(true) { + KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && + opts_.leaky_hmm_coefficient < 1.0); + // make sure the alpha sums and beta sums are zeroed. + alpha_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + beta_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + + KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); + exp_nnet_output_transposed_.ApplyExp(); +} + + +void DenominatorComputation::AlphaFirstFrame() { + // dim == num_hmm_states_ * num_sequences_. + BaseFloat *first_frame_alpha = alpha_.RowData(0); + // create a 'fake matrix' - view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(first_frame_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // TODO (possible): It would be more efficient here if we implemented a + // CopyColsFromVec function in class CuMatrix. + alpha_mat.SetZero(); + alpha_mat.AddVecToCols(1.0, den_graph_.InitialProbs(), 0.0); +} + + +// the alpha computation for some 0 < t <= num_time_steps_. +void DenominatorComputation::AlphaGeneralFrame(int32 t) { + KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); + BaseFloat *this_alpha = alpha_.RowData(t); + const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); + const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); + const DenominatorGraphTransition *transitions = den_graph_.Transitions(); + int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), + num_hmm_states = den_graph_.NumStates(), + num_sequences = num_sequences_; + + // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. + CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, + (t-1) * num_sequences_, num_sequences_); + const BaseFloat *prob_data = probs.Data(); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); + dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); + + while (1) { + if (dimGrid.y > 65535) // the hardware doesn't allow more than this. + dimGrid.y = 65535; + cuda_chain_hmm_forward(dimGrid, dimBlock, + backward_transitions, transitions, + num_sequences, den_graph_.NumStates(), + prob_data, probs.Stride(), prev_alpha_dash, + this_alpha); + CU_SAFE_CALL(cudaGetLastError()); + if (dimGrid.y == num_hmm_states) { + break; // this is the normal case. + } else { + // We reach this code only in the unusual case where num_hmm_states > + // 65535. 
We can compute the alphas for the remaining HMM states by + // moving some of the array pointers and making the call again. + backward_transitions += dimGrid.y; + this_alpha += dimGrid.y * num_sequences; + num_hmm_states -= dimGrid.y; + dimGrid.y = num_hmm_states; + } + } + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 prob_stride = probs.Stride(); + for (int32 h = 0; h < num_hmm_states; h++) { + for (int32 s = 0; s < num_sequences; s++) { + double this_tot_alpha = 0.0; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + for (; trans_iter != trans_end; ++trans_iter) { + BaseFloat transition_prob = trans_iter->transition_prob; + int32 pdf_id = trans_iter->pdf_id, + prev_hmm_state = trans_iter->hmm_state; + BaseFloat prob = prob_data[pdf_id * prob_stride + s], + this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s]; + this_tot_alpha += this_prev_alpha * transition_prob * prob; + } + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. + BaseFloat arbitrary_scale = + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; + KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; + } + } + } +} + +void DenominatorComputation::AlphaDash(int32 t) { + BaseFloat *this_alpha = alpha_.RowData(t); + + // create a 'fake matrix' for the regular alphas- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(this_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + // the alpha-dash is the sum of alpha over all states. + CuSubVector alpha_sum_vec(this_alpha + + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + + alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, + den_graph_.InitialProbs(), + alpha_sum_vec); + // it's now alpha-dash. +} + +// compute beta from beta-dash. +void DenominatorComputation::Beta(int32 t) { + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + // create a 'fake matrix' for the regular beta-dash (which is + // the counterpart of alpha-dash)- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix beta_dash_mat(this_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // making the t index implicit, the beta-dash-sum for each sequence is the sum + // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. + CuSubVector beta_dash_sum_vec( + this_beta_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, + kTrans, den_graph_.InitialProbs(), 0.0); + // we are computing beta in place. After the following, beta-dash-mat + // will contain the actual beta (i.e. the counterpart of alpha), + // not the beta-dash. 
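  // In the notation of the extended comment in chain-denominator.h, this is
  // the step beta(t, i) = beta'(t, i) + tot-beta(t): each state's beta-dash
  // gets the per-sequence leaky-HMM sum added to it.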
+ beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); +} + +BaseFloat DenominatorComputation::Forward() { + AlphaFirstFrame(); + AlphaDash(0); + for (int32 t = 1; t <= frames_per_sequence_; t++) { + AlphaGeneralFrame(t); + AlphaDash(t); + } + return ComputeTotLogLike(); +} + +BaseFloat DenominatorComputation::ComputeTotLogLike() { + tot_prob_.Resize(num_sequences_); + // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. + CuSubMatrix last_alpha_dash( + alpha_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); + // we should probably add an ApplyLog() function that takes a vector argument. + tot_log_prob_ = tot_prob_; + tot_log_prob_.ApplyLog(); + BaseFloat tot_log_prob = tot_log_prob_.Sum(); + + // We now have to add something for the arbitrary scaling factor. [note: the + // purpose of the arbitrary scaling factors was to keep things in a good + // floating-point range] + // The inverses of all the tot-alpha quantities, for t = 0 + // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in + // the transition-probs, so we need to multiply them all together (not + // inversed) and add them as a correction term to the total log-likes. + // These tot-alpha quantities were stored in the same place that we would + // have stored the HMM-state numbered 'num_hmm_states'. + int32 num_hmm_states = den_graph_.NumStates(); + CuSubMatrix inv_arbitrary_scales( + alpha_, 0, frames_per_sequence_, + num_sequences_ * num_hmm_states, num_sequences_); + CuMatrix log_inv_arbitrary_scales( + inv_arbitrary_scales); + log_inv_arbitrary_scales.ApplyLog(); + BaseFloat log_inv_arbitrary_scales_product = + log_inv_arbitrary_scales.Sum(); + return tot_log_prob + log_inv_arbitrary_scales_product; +} + + + +bool DenominatorComputation::Backward( + BaseFloat deriv_weight, + CuMatrixBase *nnet_output_deriv) { + BetaDashLastFrame(); + Beta(frames_per_sequence_); + for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { + BetaDashGeneralFrame(t); + if (GetVerboseLevel() >= 1 || t == 0) + BetaGeneralFrameDebug(t); + Beta(t); + if (t % kMaxDerivTimeSteps == 0) { + // commit the derivative stored in exp_nnet_output_transposed_ by adding + // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. + int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), + frames_per_sequence_ - t), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix transposed_deriv_part( + nnet_output_deriv_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + CuSubMatrix output_deriv_part( + *nnet_output_deriv, + t * num_sequences_, chunk_frames * num_sequences_, + 0, num_pdfs); + output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); + if (t != 0) + transposed_deriv_part.SetZero(); + } + } + return ok_; +} + +void DenominatorComputation::BetaDashLastFrame() { + // sets up the beta-dash quantity on the last frame (frame == + // frames_per_sequence_). Note that the betas we use here contain a + // 1/(tot-prob) factor in order to simplify the backprop. + + int32 t = frames_per_sequence_; + BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); + + // create a 'fake matrix' - view this row as a matrix. 
+ CuSubMatrix beta_dash_mat(last_frame_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + CuVector inv_tot_prob(tot_prob_); + inv_tot_prob.InvertElements(); + // the beta values at the end of the file only vary with the sequence-index, + // not with the HMM-index. We treat all states as having a final-prob of one. + beta_dash_mat.CopyRowsFromVec(inv_tot_prob); +} + +void DenominatorComputation::BetaDashGeneralFrame(int32 t) { + KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); + int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); + // t_wrapped gives us the time-index we use when indexing + // nnet_output_deriv_transposed_; to save memory we limit the size of the + // matrix, storing only chunks of frames at a time, and we add it to the + // non-transposed output whenever we finish a chunk. + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); + const BaseFloat *this_alpha_dash = alpha_.RowData(t), + *next_beta = beta_.RowData((t + 1) % 2); + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); + const DenominatorGraphTransition *transitions = den_graph_.Transitions(); + // 'probs' is the matrix of pseudo-likelihoods for frame t. + CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, + t * num_sequences_, num_sequences_), + log_prob_deriv(nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + + int32 num_hmm_states = den_graph_.NumStates(), + num_sequences = num_sequences_; + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); + dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); + while (1) { + if (dimGrid.y > 65535) // the hardware doesn't allow more than this. + dimGrid.y = 65535; + cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, + num_sequences, num_hmm_states, + probs.Data(), probs.Stride(), + this_alpha_dash, next_beta, this_beta_dash, + log_prob_deriv.Data(), log_prob_deriv.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + if (dimGrid.y == num_hmm_states) { + break; // this is the normal case. + } else { + // We reach this code only in the unusual case where num_hmm_states > + // 65535. We can compute the betas (and log-prob derivatives) for the + // remaining HMM states by moving some of the array pointers and making + // the call again. 
+ forward_transitions += dimGrid.y; + this_alpha_dash += dimGrid.y * num_sequences; + this_beta_dash += dimGrid.y * num_sequences; + num_hmm_states -= dimGrid.y; + dimGrid.y = num_hmm_states; + } + } + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 prob_stride = probs.Stride(), + deriv_stride = log_prob_deriv.Stride(); + const BaseFloat *prob_data = probs.Data(); + BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); + for (int32 h = 0; h < num_hmm_states; h++) { + for (int32 s = 0; s < num_sequences; s++) { + BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha_dash[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0; + BaseFloat occupation_factor = this_alpha_dash_prob / + inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + for (; trans_iter != trans_end; ++trans_iter) { + BaseFloat transition_prob = trans_iter->transition_prob; + int32 pdf_id = trans_iter->pdf_id, + next_hmm_state = trans_iter->hmm_state; + BaseFloat variable_factor = transition_prob * + next_beta[next_hmm_state * num_sequences + s] * + prob_data[pdf_id * prob_stride + s]; + tot_variable_factor += variable_factor; + BaseFloat occupation_prob = variable_factor * occupation_factor; + log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob; + } + this_beta_dash[h * num_sequences + s] = + tot_variable_factor / inv_arbitrary_scale; + } + } + } +} + +void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { + BaseFloat num_hmm_states = den_graph_.NumStates(), + alpha_beta_size = num_hmm_states * num_sequences_; + CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), + this_beta_dash(beta_.RowData(t % 2), alpha_beta_size); + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix this_log_prob_deriv( + nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, + this_beta_dash), + this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + KALDI_WARN << "On time " << t << ", alpha-beta product " + << alpha_beta_product << " != " << num_sequences_ + << " alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + if (fabs(alpha_beta_product - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. 
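  // (The expected value is num_sequences_ because the derivative of the
  // log-prob w.r.t. the log of the pseudo-likelihoods is the per-frame
  // posterior, which sums to one over pdf-ids for each sequence; the
  // randomized pruning only preserves this in expectation.)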
+ if (!ApproxEqual(this_log_prob_deriv_sum, + num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " << num_sequences_; + if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h new file mode 100644 index 00000000000..2da47a03c51 --- /dev/null +++ b/src/chain/chain-denominator.h @@ -0,0 +1,316 @@ +// chain/chain-denominator.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_DENOMINATOR_H_ +#define KALDI_CHAIN_CHAIN_DENOMINATOR_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "hmm/transition-model.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-array.h" +#include "chain/chain-den-graph.h" +#include "chain/chain-training.h" + +namespace kaldi { +namespace chain { + + +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. + + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequences. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(i, t) for 0 <= t <= T): + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probabilitiy from state i. # in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general.] + - For t = 1 ... T: + for i = 0 ... 
I-1: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p. + + - total-prob = \sum_i alpha(T, i). # note, we take the final-probs of all states + # to be 1.0. + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the overall logprob with respect to their + corresponding alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the overall logprob + w.r.t. the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + - for t = T-1 ... 0: + for i = 0 ... I-1: + beta(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / tot-alpha(t). In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. 
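   (A notation check on the correction term from Version 2: writing c_t for
   tot-alpha(t), the value the code actually returns is
   log(total-prob) + \sum_{t=0}^{T-1} \log c_t -- see ComputeTotLogLike() in
   chain-denominator.cc, which takes the log of the stored per-frame
   alpha-sums and then sums them over t.)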
+ + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. + + - For frame 0, set alpha(0, i) = init(i). + - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + + - total-prob = \sum_i alpha'(T, i) + + The corrected log-prob that we return from the algorithm will be + (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + + +// This does forward-backward in parallel on a number of sequences, using a +// single HMM. +class DenominatorComputation { + public: + /* + Constructor. 'nnet_output' is the raw nnet output (which we'll treat as + pseudo-log-likelihoods). + + @param [in] opts The options. + @param [in] graph The HMM that we use for the denominator (like a decoding graph, + with pdf-ids on the transitions). + @param [in] num_sequences The number of separate time sequences (all of the same length) + that we are working with. Must divide nnet_output.NumRows(). + @param [in] nnet_output The output of the neural network for this minibatch. + The rows must be ordered as (first frame of all sequences) + (second frame of all sequences), etc. + */ + DenominatorComputation(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + int32 num_sequences, + const CuMatrixBase &nnet_output); + + // Does the forward computation, and returns the total negated log-like summed + // over all sequences. You will have to scale this by any supervision + // weighting factor, manually. + BaseFloat Forward(); + + // this adds deriv_weight times (the derivative of the log-prob w.r.t. the + // nnet output), to 'nnet_output_deriv'. + // returns true if everything seemed OK, false if a failure was detected. + bool Backward(BaseFloat deriv_weight, + CuMatrixBase *nnet_output_deriv); + + private: + // Defining this constant as an enum is easier. 
it controls a memory/speed + // tradeoff, determining how many frames' worth of the transposed derivative + // we store at a time. It's not very critical; the only disadvantage from + // setting it small is that we have to invoke an AddMat kernel more times. + enum { kMaxDerivTimeSteps = 8 }; + + // sets up the alpha for frame t = 0. + void AlphaFirstFrame(); + // the alpha computation for some 0 < t <= num_time_steps_. + void AlphaGeneralFrame(int32 t); + // does the 'alpha-dash' computation for time t. this relates to + // 'leaky hmm'. + void AlphaDash(int32 t); + + // done after all the alphas, this function computes and returns the total + // log-likelihood summed over all the sequences, and sets tot_prob_ (if we're + // doing correction) log_correction_term_. Note, this won't be scaled by + // 'deriv_scale' (which of course we haven't seen by the time this is called, + // from the Forward() computation). + BaseFloat ComputeTotLogLike(); + + void BetaDashLastFrame(); + // beta computation for 0 <= beta < num_time_steps_. + void BetaDashGeneralFrame(int32 t); + // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). + void Beta(int32 t); + + // some checking that we can do if debug mode is activated, or on frame zero. + // Sets ok_ to false if a bad problem is detected. + void BetaGeneralFrameDebug(int32 t); + + const ChainTrainingOptions &opts_; + const DenominatorGraph &den_graph_; + + // number of separate frame sequences + int32 num_sequences_; + // number of frames per sequence. nnet_output_.NumRows() equals + // num_sequences_ * frames_per_sequence. + int32 frames_per_sequence_; + + // The transpose of the exp() of the nnet output (the transpose is more + // convenient for memory locality, and the exp() avoids us having to + // exponentiate in the forward-backward). + // + // The row-index is the pdf-id; and the column index equals (frame_index * + // num_sequences + sequence_index). + CuMatrix exp_nnet_output_transposed_; + + // the derivs w.r.t. the nnet outputs (transposed) + CuMatrix nnet_output_deriv_transposed_; + + // the (temporarily) alpha and (more permanently) alpha-dash probabilities; + // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + + // num_sequences). Note, they are not logs. The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, + // are for the alpha-sums, which relates to leaky HMM. + CuMatrix alpha_; + + // the beta (also beta-dash) probabilities (rolling buffer); dimension is 2 * + // (num-hmm-states * num-sequences + num_sequences). [the last + // 'num_sequences' columns are for the beta-sums, which relates to leaky HMM.] + // Note: for efficiency and to simplify the equations, these are actually the + // beta / tot_prob_. + CuMatrix beta_; + + // the total probability for each sequence, excluding the product of + // correction terms. [the correction terms refer to the fact that we multiply + // on each frame by 1/alpha of hmm-state 0 of the previous frame.]. + // After the correction terms the total probability is fairly close to 1, + // which is why we can store it as non-log. + CuVector tot_prob_; + + // the log of tot_prob_. + CuVector tot_log_prob_; + + // the log of the total correction term for each sequence, which is the + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. 
The product of + // them must be included in the total likelihood. + CuVector log_correction_term_; + + bool ok_; +}; + + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_DENOMINATOR_H_ + diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h new file mode 100644 index 00000000000..388c78ab2ee --- /dev/null +++ b/src/chain/chain-kernels-ansi.h @@ -0,0 +1,56 @@ +// chain/chain-kernels-ansi.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ +#define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ +#include "chain/chain-datastruct.h" + +#if HAVE_CUDA == 1 +extern "C" { + + void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *this_alpha, + const BaseFloat *next_beta, + BaseFloat *this_beta, + BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride); + + void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *prev_alpha, + BaseFloat *this_alpha); + +} // extern "C" + +#endif // HAVE_CUDA + + +#endif // KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu new file mode 100644 index 00000000000..8d555ee76cc --- /dev/null +++ b/src/chain/chain-kernels.cu @@ -0,0 +1,279 @@ +// chain/chain-kernels.cu + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
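
// Aside (illustrative only): a host-side, single-threaded reference for the
// expectation-preserving thresholded accumulation that atomic_add_thresholded()
// below implements on the GPU.  Values below the threshold are either dropped
// or rounded up to the threshold with probability (value / threshold), so the
// accumulated sum is unbiased.  'rand_uniform' is a hypothetical stand-in for
// the bit-twiddling pseudo-randomness used in the actual kernel.
static inline void add_thresholded_reference(float *address, float value,
                                             float rand_uniform /* in [0,1) */) {
  const float threshold = 1.0f / (1 << 14);  // cf. kThresholdingPowerOfTwo.
  if (value >= threshold) {
    *address += value;       // large values are added exactly.
  } else if (rand_uniform < value / threshold) {
    *address += threshold;   // small values: add the threshold itself,
  }                          // with probability (value / threshold).
}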
+ + +#include +#include "chain/chain-kernels-ansi.h" + + + +template +__device__ inline void atomic_add(Real* address, Real value) { + Real old = value; + Real ret = atomicExch(address, 0.0f); + Real new_old = ret + old; + while ((old = atomicExch(address, new_old)) != 0.0f) { + new_old = atomicExch(address, 0.0f); + new_old += old; + } +} + +template<> +__device__ inline void atomic_add(double* address, double val) { + unsigned long long int* address_as_ull = + reinterpret_cast(address); + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); +} + +template +__device__ inline void atomic_add_thresholded(Real* address, Real value) { + // This function uses a randomized algorithm to only do atomic adds for values + // >=n a threshold, and if it's below the threshold, randomly add the + // threshold itself with probability (value / threshold). This preserves + // expectations. Note: we assume that value >= 0. + + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); + if (value >= threshold) { + atomic_add(address, value); + } else { + // The intention here is to do: + // with probability(value / threshold), do: + // atomic_add(address, threshold); + // We use the least significant bits of the value as a source of + // randomness. It would probably be more efficient to extract these + // random bits directly from the float, but I don't want to have to + // deal with endian-ness issues. + // + // below, x is a fixed-point representation of (value / threshold); it would + // be 16777216 == 2^24 if value == threshold and 0 if value == 0. We choose + // the power 24 because that's the number of binary digits in the mantissa + // in IEEE single precision floating point. + // Note: we parenthesize the expression like this so that the + // denominator can be precomputed as a constant expression. + int32_cuda x = value / (threshold / (1 << 24)); + // in the line below, the expression (x >> 12) is a representation of (value / + // threshold) between 0 and 4096, with 4096 representing (value / threshold == + // 1), while (x & 4095) is treated as a pseudorandom number between 0 and 4095. + if ((x >> 12) > (x & 4095)) + atomic_add(address, threshold); + } +} + +// one iteration of the forward computation in the 'tombstone' CTC HMM computation. +// The grid y determines which HMM-state we handle. [put this in the grid because +// HMM-states don't all take the same amount of time in the backwards direction, and it's +// better for scheduling to have them at the outer level.] +// The block x and grid x determine which sequence (0 ... num_sequences - 1) we handle; +// note that num_sequences == the number of elements in the minibatch, and we +// insist they all have the same number of time steps. +// note: 'probs' is indexed by sequence-index + (pdf-index * prob_stride). +__global__ +static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *prev_alpha, + BaseFloat *this_alpha) { + // 'backward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transitions' array. 
This gives us the info for + // transitions *into* this state. 'probs' contains the exponentiated neural + // net outputs; it has dimension num-output-indexes by num_sequences and its + // stride is 'prob_stride'. 'prev_alpha' and 'this_alpha', which are + // extracted from a larger matrix, both have dimension num-history-states by + // num-sequences. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + double this_tot_alpha = 0.0; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + // Note: regarding this loop unrolling, I tried the automatic unrolling using + // #pragma unroll 2 (after modifying the loop to have an integer index), but I + // did not see any performance improvement, it was slightly slower. So the + // compiler must be doing something different than what I'm doing here. + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + prev_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], + pseudo_loglike1 = probs[pdf_id1 * prob_stride + s], + this_prev_alpha1 = prev_alpha[prev_hmm_state1 * num_sequences + s]; + + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0 + + this_prev_alpha1 * transition_prob1 * pseudo_loglike1; + } + if (trans_iter != trans_end) { + // mop up the odd transition. + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s]; + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; + } + + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. 
+ BaseFloat arbitrary_scale = + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; +} + + +__global__ +static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *this_alpha, const BaseFloat *next_beta, + BaseFloat *this_beta, BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride) { + // 'forward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transition_info' array. This is about the transitions + // *out of* this state. 'probs' contains the exponentiated neural net + // outputs; it has dimension num-output-indexes by num_sequences, and contains + // just the observation probabilities for this time index. Its stride is + // prob_stride. + // 'this_alpha', 'next_beta' and 'this_beta' all have dimension + // num-history-states by num-sequences. + // The beta probs are normalized in such a way (by multiplying by 1/(total-data-prob)) + // that to get occupation counts we don't need to multiply by 1/total-data-prob. + // deriv_scale is a factor (e.g. -1.0 or -0.99) that we multiply these derivs by + // while accumulating them. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + // See where arbitrary_scale is defined in the forward computation above, for + // more explanation of inv_arbitrary_scale. + BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0; + + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + next_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + next_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat variable_factor0 = transition_prob0 * + next_beta[next_hmm_state0 * num_sequences + s] * + probs[pdf_id0 * prob_stride + s], + variable_factor1 = transition_prob1 * + next_beta[next_hmm_state1 * num_sequences + s] * + probs[pdf_id1 * prob_stride + s]; + tot_variable_factor += variable_factor0 + variable_factor1; + BaseFloat occupation_prob0 = variable_factor0 * occupation_factor; + atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + occupation_prob0); + BaseFloat occupation_prob1 = variable_factor1 * occupation_factor; + atomic_add_thresholded(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), + occupation_prob1); + } + if (trans_iter != trans_end) { + // mop up the odd transition. 
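+    // [Editor's note: since loop_unroll == 2, at most one transition can be
+    // left over after the unrolled loop; this branch handles that remainder.]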
+    BaseFloat transition_prob0 = trans_iter[0].transition_prob;
+    int32_cuda pdf_id0 = trans_iter[0].pdf_id,
+        next_hmm_state0 = trans_iter[0].hmm_state;
+    BaseFloat variable_factor0 = transition_prob0 *
+        next_beta[next_hmm_state0 * num_sequences + s] *
+        probs[pdf_id0 * prob_stride + s];
+    tot_variable_factor += variable_factor0;
+    BaseFloat occupation_prob0 = variable_factor0 * occupation_factor;
+    atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s),
+                           occupation_prob0);
+  }
+  BaseFloat beta = tot_variable_factor / inv_arbitrary_scale;
+  this_beta[h * num_sequences + s] = beta;
+}
+
+
+void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl,
+                            const Int32Pair *backward_transitions,
+                            const DenominatorGraphTransition *transitions,
+                            int32_cuda num_sequences,
+                            int32_cuda num_hmm_states,
+                            const BaseFloat *probs, int32_cuda prob_stride,
+                            const BaseFloat *prev_alpha,
+                            BaseFloat *this_alpha) {
+  _cuda_chain_hmm_forward<<<Gr, Bl>>>(backward_transitions, transitions,
+                                      num_sequences, num_hmm_states,
+                                      probs, prob_stride,
+                                      prev_alpha, this_alpha);
+}
+
+void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl,
+                             const Int32Pair *forward_transitions,
+                             const DenominatorGraphTransition *transitions,
+                             int32_cuda num_sequences,
+                             int32_cuda num_hmm_states,
+                             const BaseFloat *probs, int32_cuda prob_stride,
+                             const BaseFloat *this_alpha, const BaseFloat *next_beta,
+                             BaseFloat *this_beta,
+                             BaseFloat *log_prob_deriv,
+                             int32_cuda log_prob_deriv_stride) {
+  _cuda_chain_hmm_backward<<<Gr, Bl>>>(forward_transitions, transitions,
+                                       num_sequences, num_hmm_states,
+                                       probs, prob_stride,
+                                       this_alpha, next_beta,
+                                       this_beta, log_prob_deriv,
+                                       log_prob_deriv_stride);
+}
+
diff --git a/src/chain/chain-numerator.cc b/src/chain/chain-numerator.cc
new file mode 100644
index 00000000000..139d28bdd77
--- /dev/null
+++ b/src/chain/chain-numerator.cc
@@ -0,0 +1,213 @@
+// chain/chain-numerator.cc
+
+// Copyright      2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+ + +#include "chain/chain-numerator.h" +#include "cudamatrix/cu-vector.h" + +namespace kaldi { +namespace chain { + + +NumeratorComputation::NumeratorComputation( + const Supervision &supervision, + const CuMatrixBase &nnet_output): + supervision_(supervision), + nnet_output_(nnet_output) { + ComputeFstStateTimes(supervision_.fst, &fst_state_times_); + KALDI_ASSERT(supervision.num_sequences * supervision.frames_per_sequence == + nnet_output.NumRows() && + supervision.label_dim == nnet_output.NumCols()); +} + + +void NumeratorComputation::ComputeLookupIndexes() { + + int32 num_states = supervision_.fst.NumStates(); + int32 num_arcs_guess = num_states * 2; + fst_output_indexes_.reserve(num_arcs_guess); + + int32 frames_per_sequence = supervision_.frames_per_sequence, + num_sequences = supervision_.num_sequences, + cur_time = 0; + + // the following is a CPU version of nnet_output_indexes_. It is a list of + // pairs (row-index, column-index) which index nnet_output_. The row-index + // corresponds to the time-frame 't', and the column-index the pdf-id, but we + // have to be a little careful with the row-index because there is a + // reordering that happens if supervision_.num_sequences > 1. + // + + // output-index) and denominator_indexes_cpu is a list of pairs (c, + // history-state-index). + std::vector nnet_output_indexes_cpu; + + // index_map_this_frame is a map, only valid for t == cur_time, + // from the pdf-id to the index into nnet_output_indexes_cpu for the + // likelihood at (cur_time, pdf-id). + unordered_map index_map_this_frame; + + typedef unordered_map::iterator IterType; + + for (int32 state = 0; state < num_states; state++) { + int32 t = fst_state_times_[state]; + if (t != cur_time) { + KALDI_ASSERT(t == cur_time + 1); + index_map_this_frame.clear(); + cur_time = t; + } + for (fst::ArcIterator aiter(supervision_.fst, state); + !aiter.Done(); aiter.Next()) { + int32 pdf_id = aiter.Value().ilabel - 1; + KALDI_ASSERT(pdf_id >= 0 && pdf_id < supervision_.label_dim); + + int32 index = nnet_output_indexes_cpu.size(); + + // the next few lines are a more efficient way of doing the following: + // if (index_map_this_frame.count(pdf_id) == 0) { + // index = index_map_this_frame[pdf_id] = nnet_output_indexes_cpu.size(); + // nnet_output_indexes_cpu.push_back(pair(pdf_id, row-index)); + // } else { + // index = index_map_this_frame[pdf_id]; + // } + std::pair p = index_map_this_frame.insert( + std::pair(pdf_id, index)); + if (p.second) { // Was inserted -> map had no key 'output_index' + Int32Pair pair; // we can't use constructors as this was declared in C. + pair.first = ComputeRowIndex(t, frames_per_sequence, num_sequences); + pair.second = pdf_id; + nnet_output_indexes_cpu.push_back(pair); + } else { // was not inserted -> set 'index' to the existing index. 
+ index = p.first->second; + } + fst_output_indexes_.push_back(index); + } + } + nnet_output_indexes_ = nnet_output_indexes_cpu; + KALDI_ASSERT(!fst_output_indexes_.empty()); +} + +BaseFloat NumeratorComputation::Forward() { + ComputeLookupIndexes(); + nnet_logprobs_.Resize(nnet_output_indexes_.Dim(), kUndefined); + nnet_output_.Lookup(nnet_output_indexes_, nnet_logprobs_.Data()); + const fst::StdVectorFst &fst = supervision_.fst; + KALDI_ASSERT(fst.Start() == 0); + int32 num_states = fst.NumStates(); + log_alpha_.Resize(num_states, kUndefined); + log_alpha_.Set(-std::numeric_limits::infinity()); + tot_log_prob_ = -std::numeric_limits::infinity(); + + log_alpha_(0) = 0.0; // note, state zero is the start state, we checked above + + const BaseFloat *nnet_logprob_data = nnet_logprobs_.Data(); + std::vector::const_iterator fst_output_indexes_iter = + fst_output_indexes_.begin(); + + double *log_alpha_data = log_alpha_.Data(); + + for (int32 state = 0; state < num_states; state++) { + double this_log_alpha = log_alpha_data[state]; + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); + aiter.Next(), ++fst_output_indexes_iter) { + const fst::StdArc &arc = aiter.Value(); + int32 nextstate = arc.nextstate; + BaseFloat transition_logprob = -arc.weight.Value(); + int32 index = *fst_output_indexes_iter; + BaseFloat pseudo_loglike = nnet_logprob_data[index]; + double &next_log_alpha = log_alpha_data[nextstate]; + next_log_alpha = LogAdd(next_log_alpha, pseudo_loglike + + transition_logprob + this_log_alpha); + } + if (fst.Final(state) != fst::TropicalWeight::Zero()) { + BaseFloat final_logprob = -fst.Final(state).Value(); + tot_log_prob_ = LogAdd(tot_log_prob_, + this_log_alpha + final_logprob); + } + } + KALDI_ASSERT(fst_output_indexes_iter == + fst_output_indexes_.end()); + return tot_log_prob_ * supervision_.weight; +} + + +void NumeratorComputation::Backward( + CuMatrixBase *nnet_output_deriv) { + const fst::StdVectorFst &fst = supervision_.fst; + int32 num_states = fst.NumStates(); + log_beta_.Resize(num_states, kUndefined); + nnet_logprob_derivs_.Resize(nnet_logprobs_.Dim()); + + // we'll be counting backwards and moving the 'fst_output_indexes_iter' + // pointer back. + const int32 *fst_output_indexes_iter = &(fst_output_indexes_[0]) + + fst_output_indexes_.size(); + const BaseFloat *nnet_logprob_data = nnet_logprobs_.Data(); + double tot_log_prob = tot_log_prob_; + double *log_beta_data = log_beta_.Data(); + const double *log_alpha_data = log_alpha_.Data(); + BaseFloat *nnet_logprob_deriv_data = nnet_logprob_derivs_.Data(); + + for (int32 state = num_states - 1; state >= 0; state--) { + int32 this_num_arcs = fst.NumArcs(state); + // on the backward pass we access the fst_output_indexes_ vector in a zigzag + // pattern. 
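+    // [Editor's note: "zigzag" here means that the states are visited in
+    // reverse order while each state's arcs were appended to
+    // fst_output_indexes_ in forward order; so we first step the pointer back
+    // by this state's arc count, then read its entries left-to-right via the
+    // separate iterator below.]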
+ fst_output_indexes_iter -= this_num_arcs; + const int32 *this_fst_output_indexes_iter = fst_output_indexes_iter; + double this_log_beta = -fst.Final(state).Value(); + double this_log_alpha = log_alpha_data[state]; + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); + aiter.Next(), this_fst_output_indexes_iter++) { + const fst::StdArc &arc = aiter.Value(); + double next_log_beta = log_beta_data[arc.nextstate]; + BaseFloat transition_logprob = -arc.weight.Value(); + int32 index = *this_fst_output_indexes_iter; + BaseFloat pseudo_loglike = nnet_logprob_data[index]; + this_log_beta = LogAdd(this_log_beta, pseudo_loglike + + transition_logprob + next_log_beta); + BaseFloat occupation_logprob = this_log_alpha + pseudo_loglike + + transition_logprob + next_log_beta - tot_log_prob, + occupation_prob = exp(occupation_logprob); + nnet_logprob_deriv_data[index] += occupation_prob; + } + // check for -inf. + KALDI_PARANOID_ASSERT(this_log_beta - this_log_beta == 0); + log_beta_data[state] = this_log_beta; + } + KALDI_ASSERT(fst_output_indexes_iter == &(fst_output_indexes_[0])); + + int32 start_state = 0; // the fact that the start state is numbered 0 is + // implied by other properties of the FST + // (epsilon-free-ness and topological sorting, and + // connectedness). + double tot_log_prob_backward = log_beta_(start_state); + if (!ApproxEqual(tot_log_prob_backward, tot_log_prob_)) + KALDI_WARN << "Disagreement in forward/backward log-probs: " + << tot_log_prob_backward << " vs. " << tot_log_prob_; + + // copy this data to GPU. + CuVector nnet_logprob_deriv_cuda; + nnet_logprob_deriv_cuda.Swap(&nnet_logprob_derivs_); + nnet_output_deriv->AddElements(supervision_.weight, nnet_output_indexes_, + nnet_logprob_deriv_cuda.Data()); +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h new file mode 100644 index 00000000000..15cb31e0571 --- /dev/null +++ b/src/chain/chain-numerator.h @@ -0,0 +1,146 @@ +// chain/chain-numerator.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_NUMERATOR_H_ +#define KALDI_CHAIN_CHAIN_NUMERATOR_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "hmm/transition-model.h" +#include "chain/chain-supervision.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-array.h" + +namespace kaldi { +namespace chain { + + +// This class is responsible for the forward-backward of the 'supervision' +// (numerator) FST. +// +// note: the supervision.weight is ignored by this class, you have to apply +// it externally. +// Because the supervision FSTs are quite skinny, i.e. 
have very few paths for
+// each frame, it's feasible to do this computation on the CPU, and that's what
+// we do.  We transfer from/to the GPU only the things that we need.
+
+class NumeratorComputation {
+
+ public:
+
+  /// Initialize the object.  Note: we expect the 'nnet_output' to have the
+  /// same number of rows as supervision.num_frames * supervision.num_sequences,
+  /// and the same number of columns as the 'label-dim' of the supervision
+  /// object (which will be the NumPdfs() of the transition model); but the
+  /// ordering of the rows of 'nnet_output' is not the same as the ordering of
+  /// frames in paths in the 'supervision' object (which has all frames of the
+  /// 1st sequence first, then the 2nd sequence, and so on).  Instead, the
+  /// frames in 'nnet_output' are ordered as: first the first frame of each
+  /// sequence, then the second frame of each sequence, and so on.  This is more
+  /// convenient both because the nnet3 code internally orders them that way,
+  /// and because this makes it easier to order things in the way that class
+  /// SingleHmmForwardBackward needs (we can just transpose, instead of doing a
+  /// 3d tensor rearrangement).
+  NumeratorComputation(const Supervision &supervision,
+                       const CuMatrixBase<BaseFloat> &nnet_output);
+
+  // TODO: we could enable a Viterbi mode.
+
+  // Does the forward computation.  Returns the total log-prob multiplied
+  // by supervision_.weight.
+  BaseFloat Forward();
+
+  // Does the backward computation and (efficiently) adds the derivative of the
+  // nnet output w.r.t. the (log-prob times supervision_.weight times
+  // deriv_weight) to 'nnet_output_deriv'.
+  void Backward(CuMatrixBase<BaseFloat> *nnet_output_deriv);
+
+ private:
+
+  const Supervision &supervision_;
+
+  // state times of supervision_.fst.
+  std::vector<int32> fst_state_times_;
+
+
+  // the exp of the neural net output.
+  const CuMatrixBase<BaseFloat> &nnet_output_;
+
+
+  // 'fst_output_indexes_' contains an entry for each arc in the supervision FST, in
+  // the order you'd get them if you visit each arc of each state in order.
+  // the contents of fst_output_indexes_ are indexes into nnet_output_indexes_
+  // and nnet_logprobs_.
+  std::vector<int32> fst_output_indexes_;
+
+  // nnet_output_indexes_ is a list of (row, column) indexes that we need to look
+  // up in nnet_output_ for the forward-backward computation.  The order is
+  // arbitrary, but indexes into this vector appear in fst_output_indexes_;
+  // and it's important that each pair only appear once (in order for the
+  // derivatives to be summed properly).
+  CuArray<Int32Pair> nnet_output_indexes_;
+
+  // the log-probs obtained from lookup in the nnet output, on the CPU.  This
+  // vector has the same size as nnet_output_indexes_.  In the backward
+  // computation, the storage is re-used for derivatives.
+  Vector<BaseFloat> nnet_logprobs_;
+
+  // derivatives w.r.t. the nnet logprobs.  These can be interpreted as
+  // occupation probabilities.
+  Vector<BaseFloat> nnet_logprob_derivs_;
+
+  // The log-alpha value (forward probability) for each state in the lattice.
+  Vector<double> log_alpha_;
+
+  // The total pseudo-log-likelihood from the forward-backward.
+  double tot_log_prob_;
+
+  // The log-beta value (backward probability) for each state in the lattice.
+  Vector<double> log_beta_;
+
+  // This function creates fst_output_indexes_ and nnet_output_indexes_.
+  void ComputeLookupIndexes();
+
+  // convert time-index in the FST to a row-index in the nnet-output (to account
+  // for the fact that the sequences are interleaved in the nnet-output).
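+  // [Editor's example: with num_sequences = 2 and frames_per_sequence = 3, the
+  // FST's time-indexes cover sequence 0 first (t = 0,1,2) and then sequence 1
+  // (t = 3,4,5), while the nnet output is ordered frame-major:
+  // (seq 0, frame 0), (seq 1, frame 0), (seq 0, frame 1), ...  So t = 4,
+  // i.e. frame 1 of sequence 1, maps to row 4/3 + 2*(4%3) = 1 + 2 = 3.]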
+ inline int32 ComputeRowIndex(int32 t, int32 frames_per_sequence, + int32 num_sequences) { + return t / frames_per_sequence + + num_sequences * (t % frames_per_sequence); + } + +}; + + + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_NUMERATOR_H_ + diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc new file mode 100644 index 00000000000..d4b891db06e --- /dev/null +++ b/src/chain/chain-supervision-test.cc @@ -0,0 +1,626 @@ +// chain/chain-supervision-test.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/chain-supervision.h" +#include "chain/chain-numerator.h" +#include "fstext/fstext-lib.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-vector.h" +#include "hmm/hmm-test-utils.h" +#include "chain/chain-den-graph.h" +#include "chain/chain-denominator.h" +#include "hmm/hmm-utils.h" + + + +namespace kaldi { +namespace chain { + +// computes a phone language-model FST, which has only monophone context. +void ComputeExamplePhoneLanguageModel(const std::vector &phones, + fst::StdVectorFst *g_fst) { + + g_fst->DeleteStates(); + int32 state = g_fst->AddState(); + g_fst->SetStart(state); + + Vector probs(phones.size() + 1); + probs.SetRandn(); + probs.ApplyPow(2.0); + probs.Add(0.01); + probs.Scale(1.0 / probs.Sum()); + + for (size_t i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + fst::StdArc arc(phone, phone, + fst::TropicalWeight(-log(probs(i))), state); + g_fst->AddArc(state, arc); + } + g_fst->SetFinal(state, fst::TropicalWeight(-log(probs(phones.size())))); +} + + +void ComputeExampleDenFst(const ContextDependency &ctx_dep, + const TransitionModel &trans_model, + fst::StdVectorFst *den_graph) { + using fst::StdVectorFst; + using fst::StdArc; + StdVectorFst phone_lm; + ComputeExamplePhoneLanguageModel(trans_model.GetPhones(), &phone_lm); + + CreateDenominatorFst(ctx_dep, trans_model, phone_lm, den_graph); +} + + +void TestSupervisionIo(const Supervision &supervision) { + bool binary = (RandInt(0, 1) == 0); + std::ostringstream os; + supervision.Write(os, binary); + std::istringstream is(os.str()); + Supervision supervision2; + if (RandInt(0, 1) == 0) + supervision2 = supervision; // test reading already-existing object. 
+ supervision2.Read(is, binary); + std::ostringstream os2; + supervision2.Write(os2, binary); + KALDI_ASSERT(os.str() == os2.str()); + if (binary) { + KALDI_ASSERT(supervision == supervision2); + } + // also test swap and constructor + Supervision supervision3(supervision), supervision4; + supervision3.Swap(&supervision4); + KALDI_ASSERT(supervision == supervision4); +} + +void TestSupervisionNumerator(const Supervision &supervision) { + + CuMatrix nnet_output(supervision.num_sequences * + supervision.frames_per_sequence, + supervision.label_dim); + nnet_output.SetRandn(); + + NumeratorComputation num(supervision, nnet_output); + + // Test that derivs are accurate. + + BaseFloat forward_prob = num.Forward(); + + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols()); + num.Backward(&nnet_output_deriv); + + int32 dim = 3; + Vector predicted_objf_changes(dim), + observed_objf_changes(dim); + BaseFloat delta = 1.0e-04; + for (int32 p = 0; p < dim; p++) { + CuMatrix new_nnet_output(nnet_output.NumRows(), + nnet_output.NumCols()); + new_nnet_output.SetRandn(); + new_nnet_output.Scale(delta); + predicted_objf_changes(p) = TraceMatMat(nnet_output_deriv, new_nnet_output, + kTrans); + new_nnet_output.AddMat(1.0, nnet_output); + NumeratorComputation num2(supervision, new_nnet_output); + observed_objf_changes(p) = num2.Forward() - forward_prob; + } + KALDI_LOG << "Predicted objf changes are: " + << predicted_objf_changes; + KALDI_LOG << "Observed objf changes are: " + << observed_objf_changes; + + { + BaseFloat correction = (predicted_objf_changes.Sum() - observed_objf_changes.Sum()) / + predicted_objf_changes.Dim(); + observed_objf_changes.Add(correction); + KALDI_LOG << "Correcting observed objf changes for statistical effects, to " + << observed_objf_changes; + KALDI_ASSERT(predicted_objf_changes.ApproxEqual(observed_objf_changes, 0.1)); + } + + + { + CuVector rand(nnet_output.NumRows()); + rand.SetRandn(); + CuMatrix nnet_output_mod(nnet_output); + nnet_output_mod.AddVecToCols(1.0, rand); + NumeratorComputation num_mod(supervision, nnet_output_mod); + BaseFloat forward_prob_mod = num_mod.Forward(); + BaseFloat predicted_change = rand.Sum(), + observed_change = forward_prob_mod - forward_prob; + KALDI_ASSERT(fabs(predicted_change - observed_change) < 0.1); + } + + +} + +void TestSupervisionAppend(const TransitionModel &trans_model, + const Supervision &supervision) { + int32 num_append = RandInt(1,5); + std::vector input(num_append); + for (int32 i = 0; i < num_append; i++) + input[i] = &supervision; + std::vector output; + bool compactify = (RandInt(0, 1) == 0); + AppendSupervision(input, compactify, &output); + if (compactify) { + KALDI_ASSERT(output.size() == 1 && + output[0].frames_per_sequence == + supervision.frames_per_sequence && + output[0].num_sequences == num_append); + } else { + KALDI_ASSERT(output.size() == input.size()); + } + int32 tot_sequences_in = 0, tot_sequences_out = 0, + tot_frames_in = 0, tot_frames_out = 0; + for (int32 i = 0; i < num_append; i++) { + tot_sequences_in += input[i]->num_sequences; + tot_frames_in += input[i]->num_sequences * + input[i]->frames_per_sequence; + } + for (int32 i = 0; i < output.size(); i++) { + tot_sequences_out += output[i].num_sequences; + tot_frames_out += output[i].num_sequences * + output[i].frames_per_sequence; + } + KALDI_ASSERT(tot_sequences_out == tot_sequences_in && + tot_frames_out == tot_frames_in); + + TestSupervisionIo(output[0]); + TestSupervisionNumerator(output[0]); + output[0].Check(trans_model); +} + +void 
TestSupervisionReattached(const TransitionModel &trans_model, + const Supervision &supervision, + const Supervision &reattached_supervision) { + using namespace fst; + KALDI_LOG << "testing reattached"; + KALDI_ASSERT(reattached_supervision.frames_per_sequence * + reattached_supervision.num_sequences == + supervision.frames_per_sequence * supervision.num_sequences && + reattached_supervision.weight == supervision.weight && + reattached_supervision.label_dim == supervision.label_dim); + UniformArcSelector selector; + RandGenOptions > randgen_opts(selector); + StdVectorFst fst_path; + RandGen(supervision.fst, &fst_path, randgen_opts); + StdVectorFst composed; + Compose(fst_path, reattached_supervision.fst, &composed); + Connect(&composed); + KALDI_ASSERT(composed.NumStates() != 0); + supervision.Check(trans_model); + reattached_supervision.Check(trans_model); +} + + +void TestSupervisionFrames(const Supervision &supervision) { + using namespace fst; + UniformArcSelector selector; + RandGenOptions > randgen_opts(selector); + VectorFst rand_path; + RandGen(supervision.fst, &rand_path, randgen_opts); + std::vector isymbols_out, osymbols_out; + fst::TropicalWeight weight_out; + bool ans = GetLinearSymbolSequence(rand_path, &isymbols_out, &osymbols_out, + &weight_out); + KALDI_ASSERT(ans); + KALDI_ASSERT(isymbols_out == osymbols_out); + KALDI_ASSERT(isymbols_out.size() == + static_cast(supervision.num_sequences * + supervision.frames_per_sequence)); + KALDI_ASSERT(weight_out == fst::TropicalWeight::One()); + + bool test = true; + // make sure epsilon free + KALDI_ASSERT(supervision.fst.Properties(fst::kNoEpsilons, test) != 0); + // make sure acceptor + KALDI_ASSERT(supervision.fst.Properties(fst::kAcceptor, test) != 0); +} + + +void ChainTrainingTest(const DenominatorGraph &den_graph, + const Supervision &supervision) { + int32 num_sequences = supervision.num_sequences, + frames_per_sequence = supervision.frames_per_sequence; + if (frames_per_sequence == 1) // this will break some code. + return; + + CuMatrix nnet_output(num_sequences * frames_per_sequence, + den_graph.NumPdfs()); + + bool zero_output = (RandInt(0, 3) == 0); + if (!zero_output) + nnet_output.SetRandn(); + + ChainTrainingOptions opts; + if (RandInt(0, 1) == 1) + opts.leaky_hmm_coefficient = 0.2; + + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + + BaseFloat objf, l2_term, weight; + + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output, &objf, &l2_term, &weight, + &nnet_output_deriv); + + { + // make sure each row of nnet_output_deriv sums to one (shift invariance of + // the nnet output). + CuVector nnet_output_deriv_row_sums(nnet_output_deriv.NumRows()); + nnet_output_deriv_row_sums.AddColSumMat(1.0, nnet_output_deriv, 0.0); + KALDI_ASSERT(nnet_output_deriv_row_sums.Norm(2.0) < 0.1); + } + + KALDI_LOG << "Chain objf per frame is " << (objf / weight) + << " over " << weight << " frames (weighted)"; + + { // a check + BaseFloat output_deriv_sum = nnet_output_deriv.Sum(); + KALDI_LOG << "Sum of nnet-output-deriv is " << output_deriv_sum + << " vs. 
expected 0."; + KALDI_ASSERT(output_deriv_sum < 0.2); + } + + KALDI_ASSERT(objf <= 0.0); + + int32 num_tries = 5; + BaseFloat epsilon = 1.0e-04; + Vector predicted_objf_changes(num_tries), + observed_objf_changes(num_tries); + for (int32 p = 0; p < num_tries; p++) { + CuMatrix nnet_delta_output(nnet_output.NumRows(), + nnet_output.NumCols()); + nnet_delta_output.SetRandn(); + nnet_delta_output.Scale(epsilon); + predicted_objf_changes(p) = TraceMatMat(nnet_output_deriv, + nnet_delta_output, kTrans); + CuMatrix nnet_output_perturbed(nnet_delta_output); + nnet_output_perturbed.AddMat(1.0, nnet_output); + + BaseFloat objf_modified, l2_term_modified, weight_modified; + + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output_perturbed, + &objf_modified, &l2_term_modified, + &weight_modified, + NULL); + + observed_objf_changes(p) = objf_modified - objf; + } + KALDI_LOG << "Predicted objf changes are " << predicted_objf_changes; + KALDI_LOG << "Observed objf changes are " << observed_objf_changes; + { + Vector error(predicted_objf_changes); + error.AddVec(-1.0, observed_objf_changes); + KALDI_LOG << "num-sequences = " << num_sequences << ", frames-per-sequence = " + << frames_per_sequence << ", relative accuracy is " + << (error.Norm(2.0) / predicted_objf_changes.Norm(2.0)); + } + + { + // we get inaccuracy for long segments, I think because there is a bias when we + // add random noise for it to increase the likelihood (for winner-take-all reasons) + // and for long utterances this bias adds up over the frames and tends to + // outweigh the random component that the gradient predicts (which will tend to + // cancel). Try to correct for this... + BaseFloat correction = (predicted_objf_changes.Sum() - observed_objf_changes.Sum()) / + predicted_objf_changes.Dim(); + observed_objf_changes.Add(correction); + KALDI_LOG << "Correcting observed objf changes for statistical effects, to " + << observed_objf_changes; + if (frames_per_sequence > 2 && + predicted_objf_changes.Norm(2.0) > 0.1 * epsilon) { + // if we only have the initial and final frames, due to the scaling-down + // of pdfs not in the numerator sequence the derivative might be zero, + // which would cause problems doing the comparison. + // note, epsilon = 1.0e-04. 
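+      // [Editor's note: the 'correction' above adds the mean of
+      // (predicted - observed) to every observed change, so the ApproxEqual
+      // check below compares only the zero-mean part of the discrepancy,
+      // which is what the gradient test can meaningfully predict.]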
+ KALDI_ASSERT(predicted_objf_changes.ApproxEqual(observed_objf_changes, 0.25)); + } + } +} + +void TestSupervisionSplitting(const ContextDependency &ctx_dep, + const TransitionModel &trans_model, + const Supervision &supervision) { + fst::StdVectorFst den_fst, normalization_fst; + ComputeExampleDenFst(ctx_dep, trans_model, &den_fst); + DenominatorGraph den_graph(den_fst, trans_model.NumPdfs()); + den_graph.GetNormalizationFst(den_fst, &normalization_fst); + + SupervisionSplitter splitter(supervision); + int32 num_frames = supervision.num_sequences * supervision.frames_per_sequence, + frames_per_range = RandInt(3, 10); + + std::vector range_starts; + SplitIntoRanges(num_frames, frames_per_range, &range_starts); + int32 num_ranges = range_starts.size(); + std::vector split_supervision(num_ranges); + for (int32 i = 0; i < num_ranges; i++) { + splitter.GetFrameRange(range_starts[i], frames_per_range, + &split_supervision[i]); + bool ans = AddWeightToSupervisionFst(normalization_fst, + &split_supervision[i]); + KALDI_ASSERT(ans); + split_supervision[i].Check(trans_model); + } + if (num_ranges > 0) { + TestSupervisionIo(split_supervision[RandInt(0, num_ranges - 1)]); + TestSupervisionFrames(split_supervision[RandInt(0, num_ranges - 1)]); + + std::vector reattached_supervision; + std::vector to_append(num_ranges); + for (int32 i = 0; i < num_ranges; i++) + to_append[i] = &(split_supervision[i]); + bool compactify = true; + AppendSupervision(to_append, compactify, &reattached_supervision); + KALDI_ASSERT(reattached_supervision.size() == 1); + ChainTrainingTest(den_graph, reattached_supervision[0]); + if (num_frames % frames_per_range == 0) { + TestSupervisionReattached(trans_model, + supervision, + reattached_supervision[0]); + } + } +} + + +void ChainDenominatorTest(const DenominatorGraph &den_graph) { + + int32 num_sequences = RandInt(1, 5), + frames_per_sequence = RandInt(10, 20); + if (RandInt(0, 3) == 0) + frames_per_sequence *= 30; // test how it works on long sequences + CuMatrix nnet_output(num_sequences * frames_per_sequence, + den_graph.NumPdfs()); + + bool zero_output = (RandInt(0, 3) == 0); + if (!zero_output) + nnet_output.SetRandn(); + + ChainTrainingOptions opts; + + DenominatorComputation denominator_computation(opts, den_graph, + num_sequences, nnet_output); + + BaseFloat forward_prob = denominator_computation.Forward(), + per_frame = forward_prob / (num_sequences * frames_per_sequence); + KALDI_LOG << "Forward prob is " << forward_prob + << " = " << per_frame << " per frame."; + + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols()); + + denominator_computation.Backward(1.0, &nnet_output_deriv); + + + { // a check + BaseFloat output_deriv_sum = nnet_output_deriv.Sum(); + KALDI_LOG << "Sum of nnet-output-deriv is " << output_deriv_sum + << " vs. 
expected " << (num_sequences * frames_per_sequence); + KALDI_ASSERT(output_deriv_sum - BaseFloat(num_sequences * frames_per_sequence) < + 10.0); + } + + int32 num_tries = 5; + BaseFloat epsilon = 1.0e-04; + Vector predicted_objf_changes(num_tries), + observed_objf_changes(num_tries); + for (int32 p = 0; p < num_tries; p++) { + CuMatrix nnet_delta_output(nnet_output.NumRows(), + nnet_output.NumCols()); + nnet_delta_output.SetRandn(); + nnet_delta_output.Scale(epsilon); + predicted_objf_changes(p) = TraceMatMat(nnet_output_deriv, + nnet_delta_output, kTrans); + CuMatrix nnet_output_perturbed(nnet_delta_output); + nnet_output_perturbed.AddMat(1.0, nnet_output); + + DenominatorComputation denominator_computation_perturbed(opts, den_graph, + num_sequences, + nnet_output_perturbed); + + BaseFloat forward_prob_perturbed = denominator_computation_perturbed.Forward(); + observed_objf_changes(p) = forward_prob_perturbed - forward_prob; + } + KALDI_LOG << "Predicted objf changes are " << predicted_objf_changes; + KALDI_LOG << "Observed objf changes are " << observed_objf_changes; + { + Vector error(predicted_objf_changes); + error.AddVec(-1.0, observed_objf_changes); + KALDI_LOG << "num-sequences = " << num_sequences << ", frames-per-sequence = " + << frames_per_sequence << ", relative error is " + << (error.Norm(2.0) / predicted_objf_changes.Norm(2.0)); + } + if (frames_per_sequence < 50) { + // we get inaccuracy for long segments, I think because there is a bias when we + // add random noise for it to increase the likelihood (for winner-take-all reasons) + // and for long utterances this bias adds up over the frames and tends to + // outweigh the random component that the gradient predicts (which will tend to + // cancel). + KALDI_ASSERT(predicted_objf_changes.ApproxEqual(observed_objf_changes, 0.25)); + } +} + + + +void ChainSupervisionTest() { + ContextDependency *ctx_dep; + TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep); + const std::vector &phones = trans_model->GetPhones(); + + int32 subsample_factor = RandInt(1, 3); + + int32 phone_sequence_length = RandInt(1, 20); + std::vector > phones_durations(phone_sequence_length); + + CompactLattice clat; + int32 cur_state = clat.AddState(); + clat.SetStart(cur_state); + + for (int32 i = 0; i < phone_sequence_length; i++) { + int32 phone = phones[RandInt(0, phones.size() - 1)]; + int32 min_length = trans_model->GetTopo().MinLength(phone), + headroom = 5, + duration = RandInt(subsample_factor * min_length, + subsample_factor * min_length + headroom); + phones_durations[i].first = phone; + phones_durations[i].second = duration; + int32 next_state = clat.AddState(); + std::vector ones(duration, 1); + clat.AddArc(cur_state, + CompactLatticeArc(phone, phone, + CompactLatticeWeight(LatticeWeight::One(), + ones), next_state)); + cur_state = next_state; + } + clat.SetFinal(cur_state, CompactLatticeWeight::One()); + ProtoSupervision proto_sup1, proto_sup2; + SupervisionOptions opts; + opts.frame_subsampling_factor = subsample_factor; + bool ans1 = AlignmentToProtoSupervision(opts, phones_durations, &proto_sup1), + ans2 = PhoneLatticeToProtoSupervision(opts, clat, &proto_sup2); + KALDI_ASSERT(ans1 && ans2); + KALDI_ASSERT(proto_sup1 == proto_sup2); + + Supervision supervision; + if (!ProtoSupervisionToSupervision(*ctx_dep, *trans_model, + proto_sup1, &supervision)) { + // we shouldn't fail because we multiplied by + // 'subsample_factor' when creating the duration. 
+ KALDI_ERR << "Failed creating supervision."; + } + supervision.Check(*trans_model); + TestSupervisionIo(supervision); + TestSupervisionSplitting(*ctx_dep, *trans_model, supervision); + TestSupervisionAppend(*trans_model, supervision); + + { + fst::StdVectorFst den_fst; + ComputeExampleDenFst(*ctx_dep, *trans_model, &den_fst); + DenominatorGraph den_graph(den_fst, trans_model->NumPdfs()); + ChainDenominatorTest(den_graph); + if (RandInt(0, 1) == 0) + supervision.weight = 0.5; + fst::StdVectorFst normalization_fst; + den_graph.GetNormalizationFst(den_fst, &normalization_fst); + // add the weight to the numerator FST so we can assert objf <= 0. + bool ans = AddWeightToSupervisionFst(normalization_fst, &supervision); + KALDI_ASSERT(ans); + // TODO: still have to test for appended sequences. + ChainTrainingTest(den_graph, supervision); + } + + delete ctx_dep; + delete trans_model; +} + +void AddArc(int32 from, int32 to, + fst::StdVectorFst *fst) { + fst->AddArc(from, fst::StdArc(0, 0, fst::TropicalWeight::One(), to)); +} + +void BreadthFirstTest() { + using namespace fst; + StdVectorFst fst; + for (int32 i = 0; i < 6; i++) + fst.AddState(); + fst.SetStart(0); + fst.SetFinal(2, TropicalWeight::One()); + AddArc(0, 3, &fst); + AddArc(0, 4, &fst); + AddArc(4, 5, &fst); + AddArc(3, 5, &fst); + AddArc(5, 1, &fst); + AddArc(1, 2, &fst); + SortBreadthFirstSearch(&fst); + + KALDI_ASSERT(fst.Properties(fst::kTopSorted, true) != 0); + +} + +// this function tests SplitIntoRanges() and GetWeightsForRanges(). +void TestRanges() { + int32 frames_per_range = RandInt(20, 100), + overlap = RandInt(0, 10), + num_frames = RandInt(15, 500); + std::vector range_starts; + SplitIntoRanges(num_frames - overlap, frames_per_range - overlap, + &range_starts); + Vector weights_orig(num_frames), + weights_new(num_frames); + int32 num_ranges = range_starts.size(); + for (int32 i = 0; i < num_ranges; i++) { + int32 start_t = range_starts[i]; + for (int32 j = 0; j < frames_per_range; j++) { + int32 t = start_t + j; + weights_orig(t) += 1.0; + } + } + std::vector > weights; + GetWeightsForRanges(frames_per_range, + range_starts, &weights); + for (int32 i = 0; i < num_ranges; i++) { + KALDI_LOG << "weights[" << i << "] = " + << weights[i]; + int32 start_t = range_starts[i]; + for (int32 j = 0; j < frames_per_range; j++) { + int32 t = start_t + j; + weights_new(t) += weights[i](j); + } + } + KALDI_LOG << "Orig weights are " << weights_orig; + KALDI_LOG << "New weights are " << weights_new; + for (int32 t = 0; t < num_frames; t++) { + if (weights_orig(t) != 0.0) { + KALDI_ASSERT(fabs(weights_new(t) - 1.0) < 0.001); + } else { + KALDI_ASSERT(weights_new(t) == 0.0); + } + } +} + + +} // namespace chain +} // namespace kaldi + +int main() { + using namespace kaldi; + + for (int32 loop = 0; loop < 2; loop++) { +#if HAVE_CUDA == 1 + if (loop == 0) + CuDevice::Instantiate().SelectGpuId("no"); + else + CuDevice::Instantiate().SelectGpuId("yes"); +#endif + for (int32 i = 0; i < 5; i++) { + kaldi::chain::ChainSupervisionTest(); + kaldi::chain::BreadthFirstTest(); + } + kaldi::chain::TestRanges(); +#if HAVE_CUDA == 1 + CuDevice::Instantiate().PrintProfile(); +#endif + } +} diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc new file mode 100644 index 00000000000..3074e9c7742 --- /dev/null +++ b/src/chain/chain-supervision.cc @@ -0,0 +1,831 @@ +// chain/chain-supervision.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/chain-supervision.h" +#include "lat/lattice-functions.h" +#include "util/text-utils.h" +#include "hmm/hmm-utils.h" +#include + +namespace kaldi { +namespace chain { + +const int kSupervisionMaxStates = 200000; // we can later make this + // configurable if needed. + +// attempts determinization (with limited max-states) and minimization; +// returns true on success +bool TryDeterminizeMinimize(int32 supervision_max_states, + fst::StdVectorFst *supervision_fst) { + if (supervision_fst->NumStates() >= supervision_max_states) { + KALDI_WARN << "Not attempting determinization as number of states " + << "is too large " << supervision_fst->NumStates(); + return false; + } + fst::DeterminizeOptions opts; + opts.state_threshold = supervision_max_states; + fst::StdVectorFst fst_copy = *supervision_fst; + fst::Determinize(fst_copy, supervision_fst, opts); + // the - 1 here is just because I'm not sure if it stops just before the + // threshold. + if (supervision_fst->NumStates() >= opts.state_threshold - 1) { + KALDI_WARN << "Determinization stopped early after reaching " + << supervision_fst->NumStates() << " states. Likely " + << "this utterance has a very strange transcription."; + return false; + } + fst::Minimize(supervision_fst); + return true; +} + +void ProtoSupervision::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + if (!binary) os << "\n"; + int32 num_frames = allowed_phones.size(); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_frames); + if (!binary) os << "\n"; + WriteToken(os, binary, ""); + if (!binary) os << "\n"; + for (int32 i = 0; i < num_frames; i++) + WriteIntegerVector(os, binary, allowed_phones[i]); + if (!binary) os << "\n"; + WriteFstKaldi(os, binary, fst); + WriteToken(os, binary, ""); + if (!binary) os << "\n"; +} + +void SupervisionOptions::Check() const { + KALDI_ASSERT(left_tolerance >= 0 && right_tolerance >= 0 && + frame_subsampling_factor > 0 && + left_tolerance + right_tolerance >= frame_subsampling_factor); +} + +bool AlignmentToProtoSupervision(const SupervisionOptions &opts, + const std::vector &phones, + const std::vector &durations, + ProtoSupervision *proto_supervision) { + opts.Check(); + KALDI_ASSERT(phones.size() > 0 && phones.size() == durations.size()); + std::vector labels(phones.size()); + int32 num_frames = std::accumulate(durations.begin(), durations.end(), 0), + factor = opts.frame_subsampling_factor, + num_frames_subsampled = (num_frames + factor - 1) / factor; + proto_supervision->allowed_phones.clear(); + proto_supervision->allowed_phones.resize(num_frames_subsampled); + proto_supervision->fst.DeleteStates(); + if (num_frames_subsampled == 0) + return false; + + int32 current_frame = 0, num_phones = phones.size(); + for (int32 i = 0; i < num_phones; i++) { + int32 phone = phones[i], duration = durations[i]; + KALDI_ASSERT(phone > 0 && duration > 0); + int32 t_start = 
std::max(0, (current_frame - opts.left_tolerance)), + t_end = std::min(num_frames, + (current_frame + duration + opts.right_tolerance)), + t_start_subsampled = (t_start + factor - 1) / factor, + t_end_subsampled = (t_end + factor - 1) / factor; + + // note: if opts.Check() passed, the following assert should pass too. + KALDI_ASSERT(t_end_subsampled > t_start_subsampled && + t_end_subsampled <= num_frames_subsampled); + for (int32 t_subsampled = t_start_subsampled; + t_subsampled < t_end_subsampled; t_subsampled++) + proto_supervision->allowed_phones[t_subsampled].push_back(phone); + current_frame += duration; + } + KALDI_ASSERT(current_frame == num_frames); + for (int32 t_subsampled = 0; t_subsampled < num_frames_subsampled; + t_subsampled++) { + KALDI_ASSERT(!proto_supervision->allowed_phones[t_subsampled].empty()); + SortAndUniq(&(proto_supervision->allowed_phones[t_subsampled])); + } + fst::MakeLinearAcceptor(phones, &(proto_supervision->fst)); + return true; +} + +bool AlignmentToProtoSupervision( + const SupervisionOptions &opts, + const std::vector > &phones_durations, + ProtoSupervision *proto_supervision) { + KALDI_ASSERT(phones_durations.size() > 0); + std::vector phones(phones_durations.size()), + durations(phones_durations.size()); + for (size_t size = phones_durations.size(), i = 0; i < size; i++) { + phones[i] = phones_durations[i].first; + durations[i] = phones_durations[i].second; + } + return AlignmentToProtoSupervision(opts, phones, durations, + proto_supervision); +} + + +bool ProtoSupervision::operator == (const ProtoSupervision &other) const { + return (allowed_phones == other.allowed_phones && + fst::Equal(fst, other.fst)); +} + +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { + opts.Check(); + if (lat.NumStates() == 0) { + KALDI_WARN << "Empty lattice provided"; + return false; + } + int32 num_states = lat.NumStates(); + proto_supervision->fst.DeleteStates(); + proto_supervision->fst.ReserveStates(num_states); + std::vector state_times; + int32 num_frames = CompactLatticeStateTimes(lat, &state_times), + factor = opts.frame_subsampling_factor, + num_frames_subsampled = (num_frames + factor - 1) / factor; + for (int32 state = 0; state < num_states; state++) + proto_supervision->fst.AddState(); + proto_supervision->fst.SetStart(lat.Start()); + + proto_supervision->allowed_phones.clear(); + proto_supervision->allowed_phones.resize(num_frames_subsampled); + + for (int32 state = 0; state < num_states; state++) { + int32 state_time = state_times[state]; + for (fst::ArcIterator aiter(lat, state); !aiter.Done(); + aiter.Next()) { + const CompactLatticeArc &lat_arc = aiter.Value(); + int32 next_state_time = state_time + lat_arc.weight.String().size(); + int32 phone = lat_arc.ilabel; // It's an acceptor so ilabel == ollabel. + if (phone == 0) { + KALDI_WARN << "CompactLattice has epsilon arc. 
Unexpected."; + return false; + } + proto_supervision->fst.AddArc(state, + fst::StdArc(phone, phone, + fst::TropicalWeight::One(), + lat_arc.nextstate)); + int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), + t_end = std::min(num_frames, + (next_state_time + opts.right_tolerance)), + t_begin_subsampled = (t_begin + factor - 1)/ factor, + t_end_subsampled = (t_end + factor - 1)/ factor; + for (int32 t_subsampled = t_begin_subsampled; + t_subsampled < t_end_subsampled; t_subsampled++) + proto_supervision->allowed_phones[t_subsampled].push_back(phone); + } + if (lat.Final(state) != CompactLatticeWeight::Zero()) { + proto_supervision->fst.SetFinal(state, fst::TropicalWeight::One()); + if (state_times[state] != num_frames) { + KALDI_WARN << "Time of final state " << state << " in lattice is " + << "not equal to number of frames " << num_frames + << ". Are you sure the lattice is phone-aligned? " + << "Rejecting it."; + return false; + } + } + } + for (int32 t_subsampled = 0; t_subsampled < num_frames_subsampled; + t_subsampled++) { + KALDI_ASSERT(!proto_supervision->allowed_phones[t_subsampled].empty()); + SortAndUniq(&(proto_supervision->allowed_phones[t_subsampled])); + } + return true; +} + + +bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { + // the following call will do the range-check on 'ilabel'. + int32 phone = trans_model_.TransitionIdToPhone(ilabel); + KALDI_ASSERT(static_cast(s) <= allowed_phones_.size()); + if (static_cast(s) == allowed_phones_.size()) { + // No arcs come from the final state.a + return false; + } + if (std::binary_search(allowed_phones_[s].begin(), + allowed_phones_[s].end(), phone)) { + // the olabel will be a pdf-id plus one, not a transition-id. + int32 pdf_id = trans_model_.TransitionIdToPdf(ilabel); + oarc->ilabel = ilabel; + oarc->olabel = pdf_id + 1; + oarc->weight = fst::TropicalWeight::One(); + oarc->nextstate = s + 1; + return true; + } else { + return false; + } +} + + +bool ProtoSupervisionToSupervision( + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + const ProtoSupervision &proto_supervision, + Supervision *supervision) { + using fst::VectorFst; + using fst::StdArc; + VectorFst phone_fst(proto_supervision.fst); + int32 subsequential_symbol = trans_model.GetPhones().back() + 1; + if (ctx_dep.CentralPosition() != ctx_dep.ContextWidth() - 1) { + // note: this function only adds the subseq symbol to the input of what was + // previously an acceptor, so we project, i.e. copy the ilabels to the + // olabels + AddSubsequentialLoop(subsequential_symbol, &phone_fst); + fst::Project(&phone_fst, fst::PROJECT_INPUT); + } + std::vector disambig_syms; // empty list of diambiguation symbols. + fst::ContextFst cfst(subsequential_symbol, trans_model.GetPhones(), + disambig_syms, ctx_dep.ContextWidth(), + ctx_dep.CentralPosition()); + VectorFst context_dep_fst; + fst::ComposeContextFst(cfst, phone_fst, &context_dep_fst); + // at this point, context_dep_fst will have indexes into 'ilabels' as its + // input symbol (representing context-dependent phones), and phones on its + // output. We don't need the phones, so we'll project. + fst::Project(&context_dep_fst, fst::PROJECT_INPUT); + + std::vector disambig_syms_h; // disambiguation symbols on input side + // of H -- will be empty. + + HTransducerConfig h_cfg; + + // We don't want to add any transition probabilities as they will be added + // when we compose with the denominator graph. 
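+  // [Editor's note: with transition_scale = 0.0 here and self_loop_scale = 0.0
+  // below, the H transducer contributes no weights, so the resulting numerator
+  // FST only constrains which pdf-ids may appear on which frames; probabilities
+  // are reintroduced later when the normalization FST derived from the
+  // denominator graph is composed in (see AddWeightToSupervisionFst in the
+  // test code).]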
+ h_cfg.transition_scale = 0.0; + h_cfg.push_weights = false; // there's nothing to push. + + + VectorFst *h_fst = GetHTransducer(cfst.ILabelInfo(), + ctx_dep, + trans_model, + h_cfg, + &disambig_syms_h); + KALDI_ASSERT(disambig_syms_h.empty()); + + VectorFst transition_id_fst; + TableCompose(*h_fst, context_dep_fst, &transition_id_fst); + delete h_fst; + + // We don't want to add any transition probabilities as they will be added + // when we compose with the denominator graph. + BaseFloat self_loop_scale = 0.0; + + bool reorder = true; // more efficient in general; won't affect results. + // add self-loops to the FST with transition-ids as its labels. + AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, + &transition_id_fst); + + // at this point transition_id_fst will have transition-ids as its ilabels and + // context-dependent phones (indexes into ILabelInfo()) as its olabels. + // Discard the context-dependent phones by projecting on the input, keeping + // only the transition-ids. + fst::Project(&transition_id_fst, fst::PROJECT_INPUT); + if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) { + // remove epsilons, if there are any. + fst::RmEpsilon(&transition_id_fst); + } + KALDI_ASSERT(transition_id_fst.NumStates() > 0); + + // The last step is to enforce that phones can only appear on the frames they + // are 'allowed' to appear on. This will also convert the FST to have pdf-ids + // plus one as the labels + TimeEnforcerFst enforcer_fst(trans_model, proto_supervision.allowed_phones); + ComposeDeterministicOnDemand(transition_id_fst, + &enforcer_fst, + &(supervision->fst)); + fst::Connect(&(supervision->fst)); + // at this point supervision->fst will have pdf-ids plus one as the olabels, + // but still transition-ids as the ilabels. Copy olabels to ilabels. + fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + + KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + if (supervision->fst.NumStates() == 0) { + KALDI_WARN << "Supervision FST is empty (too many phones for too few " + << "frames?)"; + // possibly there were too many phones for too few frames. + return false; + } + + supervision->weight = 1.0; + supervision->num_sequences = 1; + supervision->frames_per_sequence = proto_supervision.allowed_phones.size(); + supervision->label_dim = trans_model.NumPdfs(); + SortBreadthFirstSearch(&(supervision->fst)); + return true; +} + + + +SupervisionSplitter::SupervisionSplitter( + const Supervision &supervision): + supervision_(supervision), + frame_(supervision_.fst.NumStates(), -1) { + const fst::StdVectorFst &fst(supervision_.fst); + // The fst in struct Supervision is supposed to be epsilon-free and + // topologically sorted; this function relies on those properties to + // set up the frame_ vector (which maps each state in the + // FST to a frame-index 0 <= t < num_frames), and it checks them. + if (supervision_.num_sequences != 1) { + KALDI_WARN << "Splitting already-reattached sequence (only expected in " + << "testing code)"; + } + int32 num_states = fst.NumStates(), + num_frames = supervision_.frames_per_sequence * supervision_.num_sequences; + KALDI_ASSERT(num_states > 0); + int32 start_state = fst.Start(); + // FST should be top-sorted and connected, so start-state must be 0. 
+ KALDI_ASSERT(start_state == 0 && "Expecting start-state to be 0"); + frame_[start_state] = 0; + for (int32 state = 0; state < num_states; state++) { + int32 cur_frame = frame_[state]; + if (cur_frame == -1) { + // If this happens it means the Supervision does not have the required + // properties, e.g. being top-sorted and connected. + KALDI_ERR << "Error computing frame indexes for Supervision"; + } + for (fst::ArcIterator aiter(fst, state); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + // The FST is supposed to be an epsilon-free acceptor. + KALDI_ASSERT(arc.ilabel == arc.olabel && arc.ilabel > 0); + int32 nextstate = arc.nextstate; + KALDI_ASSERT(nextstate >= 0 && nextstate < num_states); + // all arcs go from some t to t + 1. + int32 &next_frame = frame_[nextstate]; + if (next_frame == -1) + next_frame = cur_frame + 1; + else + KALDI_ASSERT(next_frame == cur_frame + 1); + } + } + // The following assert checks that the number of frames in the FST + // matches the num_frames stored in the supervision object; it also relies + // on the topological sorting and connectedness of the FST. + KALDI_ASSERT(frame_.back() == num_frames); + std::vector::iterator iter = frame_.begin(), + end = iter + (frame_.size() - 1); + // check that the frame-indexes of states are monotonically non-decreasing, as + // they should be based on the top-sorting. We rely on this property to + // compute the frame ranges while splitting. + while (iter != end) { + int32 cur_t = *iter; + ++iter; + int32 next_t = *iter; + KALDI_ASSERT(next_t >= cur_t); + } +} + +void SupervisionSplitter::GetFrameRange(int32 begin_frame, int32 num_frames, + Supervision *out_supervision) const { + int32 end_frame = begin_frame + num_frames; + // Note: end_frame is not included in the range of frames that the + // output supervision object covers; it's one past the end. + KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 && + begin_frame + num_frames <= + supervision_.num_sequences * supervision_.frames_per_sequence); + std::vector::const_iterator begin_iter = + std::lower_bound(frame_.begin(), frame_.end(), begin_frame), + end_iter = std::lower_bound(begin_iter, frame_.end(), end_frame); + KALDI_ASSERT(*begin_iter == begin_frame && + (begin_iter == frame_.begin() || begin_iter[-1] < begin_frame)); + // even if end_frame == supervision_.num_frames, there should be a state with + // that frame index. + KALDI_ASSERT(end_iter[-1] < end_frame && + (end_iter < frame_.end() || *end_iter == end_frame)); + int32 begin_state = begin_iter - frame_.begin(), + end_state = end_iter - frame_.begin(); + + CreateRangeFst(begin_frame, end_frame, + begin_state, end_state, &(out_supervision->fst)); + + KALDI_ASSERT(out_supervision->fst.NumStates() > 0); + KALDI_ASSERT(supervision_.num_sequences == 1); + out_supervision->num_sequences = 1; + out_supervision->weight = supervision_.weight; + out_supervision->frames_per_sequence = num_frames; + out_supervision->label_dim = supervision_.label_dim; +} + +void SupervisionSplitter::CreateRangeFst( + int32 begin_frame, int32 end_frame, + int32 begin_state, int32 end_state, + fst::StdVectorFst *fst) const { + // There will be a special pre-start state that has epsilon transitions to all + // states whose frame equals begin_frame; we'll later do RmEpsilon to remove + // these. Next we will include all states begin_state <= s < end_state in the + // output FST, plus (if end_frame != supervision_.num_frames) a special final + // state. 
All transitions to states >= end_state will be turned into + // a transition to the special final state. There should be no final-probs + // on the states begin_state <= s < end_state. + KALDI_ASSERT(end_state > begin_state); + fst->DeleteStates(); + fst->ReserveStates(end_state - begin_state + 2); + int32 start_state = fst->AddState(); + fst->SetStart(start_state); + for (int32 i = begin_state; i < end_state; i++) + fst->AddState(); + // Add the special final-state. + int32 final_state = fst->AddState(); + fst->SetFinal(final_state, fst::TropicalWeight::One()); + for (int32 state = begin_state; state < end_state; state++) { + int32 output_state = state - begin_state + 1; + if (frame_[state] == begin_frame) { + // we'd like to make this an initial state, but OpenFst doesn't allow + // multiple initial states. Instead we add an epsilon transition to it + // from our actual initial state; we'll later do RmEpsilon and + // determinize. + fst->AddArc(start_state, + fst::StdArc(0, 0, fst::TropicalWeight::One(), + output_state)); + } else { + KALDI_ASSERT(frame_[state] < end_frame); + } + typedef fst::ArcIterator IterType; + for (IterType aiter(supervision_.fst, state); !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc(aiter.Value()); + int32 nextstate = arc.nextstate; + if (nextstate >= end_state) { + // A transition to any state outside the range becomes a transition to + // our special final-state. + fst->AddArc(output_state, + fst::StdArc(arc.ilabel, arc.olabel, + arc.weight, final_state)); + } else { + int32 output_nextstate = arc.nextstate - begin_state + 1; + // note: arc.ilabel should equal arc.olabel and arc.weight should equal + // fst::TropicalWeight::One(). + fst->AddArc(output_state, + fst::StdArc(arc.ilabel, arc.olabel, + arc.weight, output_nextstate)); + } + } + } +} + + +// I couldn't figure out how to do this with OpenFST's native 'visitor' and +// queue mechanisms so I'm just coding this myself. +void SortBreadthFirstSearch(fst::StdVectorFst *fst) { + std::vector state_order(fst->NumStates(), -1); + std::vector seen(fst->NumStates(), false); + int32 start_state = fst->Start(); + KALDI_ASSERT(start_state >= 0); + std::deque queue; + queue.push_back(start_state); + seen[start_state] = true; + int32 num_output = 0; + while (!queue.empty()) { + int32 state = queue.front(); + state_order[state] = num_output++; + queue.pop_front(); + for (fst::ArcIterator aiter(*fst, state); + !aiter.Done(); aiter.Next()) { + int32 nextstate = aiter.Value().nextstate; + if (!seen[nextstate]) { + seen[nextstate] = true; + queue.push_back(nextstate); + } + } + } + if (num_output != fst->NumStates()) + KALDI_ERR << "Input to SortBreadthFirstSearch must be connected."; + fst::StateSort(fst, state_order); +} + + + +void Supervision::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, weight); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_sequences); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, frames_per_sequence); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, label_dim); + KALDI_ASSERT(frames_per_sequence > 0 && label_dim > 0 && + num_sequences > 0); + if (binary == false) { + // In text mode, write the FST without any compactification. + WriteFstKaldi(os, binary, fst); + } else { + // Write using StdAcceptorCompactFst, making use of the fact that it's an + // acceptor. 
+ fst::FstWriteOptions write_options(""); + fst::StdCompactAcceptorFst::WriteFst( + fst, fst::AcceptorCompactor(), os, + write_options); + } + WriteToken(os, binary, ""); +} + +void Supervision::Swap(Supervision *other) { + std::swap(weight, other->weight); + std::swap(num_sequences, other->num_sequences); + std::swap(frames_per_sequence, other->frames_per_sequence); + std::swap(label_dim, other->label_dim); + std::swap(fst, other->fst); +} + +void Supervision::Read(std::istream &is, bool binary) { + ExpectToken(is, binary, ""); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &weight); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_sequences); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &frames_per_sequence); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &label_dim); + if (!binary) { + ReadFstKaldi(is, binary, &fst); + } else { + fst::StdCompactAcceptorFst *compact_fst = + fst::StdCompactAcceptorFst::Read( + is, fst::FstReadOptions(std::string("[unknown]"))); + if (compact_fst == NULL) + KALDI_ERR << "Error reading compact FST from disk"; + fst = *compact_fst; + delete compact_fst; + } + // ReadFstKaldi will work even though we wrote using a compact format. + ExpectToken(is, binary, ""); +} + +int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, + std::vector *state_times) { + if (fst.Start() != 0) // this is implied by our properties. + KALDI_ERR << "Expecting input FST start state to be zero"; + int32 num_states = fst.NumStates(); + int32 total_length = -1; + state_times->clear(); + state_times->resize(num_states, -1); + (*state_times)[0] = 0; + for (int32 state = 0; state < num_states; state++) { + int32 next_state_time = (*state_times)[state] + 1; + if (next_state_time <= 0) // i.e. (*state_times)[state] < 0 + KALDI_ERR << "Input FST does not have required properties."; + for (fst::ArcIterator aiter(fst, state); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + KALDI_ASSERT(arc.ilabel != 0); + int32 &next_state_ref = (*state_times)[arc.nextstate]; + if (next_state_ref == -1) + next_state_ref = next_state_time; + else if (next_state_ref != next_state_time) + KALDI_ERR << "Input FST does not have required properties."; + } + if (fst.Final(state) != fst::TropicalWeight::Zero()) { + if (total_length == -1) + total_length = next_state_time - 1; + else if (total_length != next_state_time - 1) + KALDI_ERR << "Input FST does not have required properties."; + } + } + if (total_length < 0) + KALDI_ERR << "Input FST does not have required properties."; + return total_length; +} + +Supervision::Supervision(const Supervision &other): + weight(other.weight), num_sequences(other.num_sequences), + frames_per_sequence(other.frames_per_sequence), + label_dim(other.label_dim), fst(other.fst) { } + +void AppendSupervision(const std::vector &input, + bool compactify, + std::vector *output_supervision) { + KALDI_ASSERT(!input.empty()); + int32 label_dim = input[0]->label_dim, + num_inputs = input.size(); + if (num_inputs == 1) { + output_supervision->resize(1); + (*output_supervision)[0] = *(input[0]); + return; + } + std::vector output_was_merged; + for (int32 i = 1; i < num_inputs; i++) + KALDI_ASSERT(input[i]->label_dim == label_dim && + "Trying to append incompatible Supervision objects"); + output_supervision->clear(); + output_supervision->reserve(input.size()); + for (int32 i = 0; i < input.size(); i++) { + const Supervision &src = *(input[i]); + if (compactify && !output_supervision->empty() && + 
output_supervision->back().weight == src.weight && + output_supervision->back().frames_per_sequence == + src.frames_per_sequence) { + // Combine with current output + // append src.fst to output_supervision->fst. + fst::Concat(&output_supervision->back().fst, src.fst); + output_supervision->back().num_sequences++; + output_was_merged.back() = true; + } else { + output_supervision->resize(output_supervision->size() + 1); + output_supervision->back() = src; + output_was_merged.push_back(false); + } + } + KALDI_ASSERT(output_was_merged.size() == output_supervision->size()); + for (size_t i = 0; i < output_supervision->size(); i++) { + if (output_was_merged[i]) { + fst::StdVectorFst &out_fst = (*output_supervision)[i].fst; + // The process of concatenation will have introduced epsilons. + fst::RmEpsilon(&out_fst); + SortBreadthFirstSearch(&out_fst); + } + } +} + +bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, + Supervision *supervision) { + // remove epsilons before composing. 'normalization_fst' has noepsilons so + // the composed result will be epsilon free. + fst::StdVectorFst supervision_fst_noeps(supervision->fst); + fst::RmEpsilon(&supervision_fst_noeps); + if (!TryDeterminizeMinimize(kSupervisionMaxStates, + &supervision_fst_noeps)) + return false; + + // note: by default, 'Compose' will call 'Connect', so if the + // resulting FST is not connected, it will end up empty. + fst::StdVectorFst composed_fst; + fst::Compose(supervision_fst_noeps, normalization_fst, + &composed_fst); + if (composed_fst.NumStates() == 0) + return false; + // projection should not be necessary, as both FSTs are acceptors. + // determinize and minimize to make it as compact as possible. + + if (!TryDeterminizeMinimize(kSupervisionMaxStates, + &composed_fst)) + return false; + supervision->fst = composed_fst; + + // Make sure the states are numbered in increasing order of time. + SortBreadthFirstSearch(&(supervision->fst)); + KALDI_ASSERT(supervision->fst.Properties(fst::kAcceptor, true) == fst::kAcceptor); + KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + return true; +} + +void SplitIntoRanges(int32 num_frames, + int32 frames_per_range, + std::vector *range_starts) { + if (frames_per_range > num_frames) { + range_starts->clear(); + return; // there is no room for even one range. + } + int32 num_ranges = num_frames / frames_per_range, + extra_frames = num_frames % frames_per_range; + // this is a kind of heuristic. If the number of frames we'd + // be skipping is less than 1/4 of the frames_per_range, then + // skip frames; otherwise, duplicate frames. + // it's important that this is <=, not <, so that if + // extra_frames == 0 and frames_per_range is < 4, we + // don't insert an extra range. + if (extra_frames <= frames_per_range / 4) { + // skip frames. we do this at start or end, or between ranges. + std::vector num_skips(num_ranges + 1, 0); + for (int32 i = 0; i < extra_frames; i++) + num_skips[RandInt(0, num_ranges)]++; + range_starts->resize(num_ranges); + int32 cur_start = num_skips[0]; + for (int32 i = 0; i < num_ranges; i++) { + (*range_starts)[i] = cur_start; + cur_start += frames_per_range; + cur_start += num_skips[i + 1]; + } + KALDI_ASSERT(cur_start == num_frames); + } else { + // duplicate frames. 
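+    // For example, with num_frames = 25 and frames_per_range = 10 we get
+    // extra_frames = 5, which is more than 10 / 4, so we take this branch:
+    // we emit num_ranges = 3 ranges and 'backtrack' a total of
+    // frames_per_range - extra_frames = 5 frames between them, giving e.g.
+    // range_starts = { 0, 7, 15 } (the exact split is randomized).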
+ num_ranges++; + int32 num_duplicated_frames = frames_per_range - extra_frames; + // the way we handle the 'extra_frames' frames of output is that we + // backtrack zero or more frames between outputting each pair of ranges, and + // the total of these backtracks equals 'extra_frames'. + std::vector num_backtracks(num_ranges, 0); + for (int32 i = 0; i < num_duplicated_frames; i++) { + // num_ranges - 2 below is not a bug. we only want to backtrack + // between ranges, not past the end of the last range (i.e. at + // position num_ranges - 1). we make the vector one longer to + // simplify the loop below. + num_backtracks[RandInt(0, num_ranges - 2)]++; + } + range_starts->resize(num_ranges); + int32 cur_start = 0; + for (int32 i = 0; i < num_ranges; i++) { + (*range_starts)[i] = cur_start; + cur_start += frames_per_range; + cur_start -= num_backtracks[i]; + } + KALDI_ASSERT(cur_start == num_frames); + } +} + +bool Supervision::operator == (const Supervision &other) const { + return weight == other.weight && num_sequences == other.num_sequences && + frames_per_sequence == other.frames_per_sequence && + label_dim == other.label_dim && fst::Equal(fst, other.fst); +} + +void Supervision::Check(const TransitionModel &trans_mdl) const { + if (weight <= 0.0) + KALDI_ERR << "Weight should be positive."; + if (frames_per_sequence <= 0) + KALDI_ERR << "Invalid frames_per_sequence: " << frames_per_sequence; + if (num_sequences <= 0) + KALDI_ERR << "Invalid num_sequences: " << num_sequences; + if (label_dim != trans_mdl.NumPdfs()) + KALDI_ERR << "Invalid label-dim: " << label_dim + << ", expected " << trans_mdl.NumPdfs(); + std::vector state_times; + if (frames_per_sequence * num_sequences != + ComputeFstStateTimes(fst, &state_times)) + KALDI_ERR << "Num-frames does not match fst."; +} + +void GetWeightsForRanges(int32 range_length, + const std::vector &range_starts, + std::vector > *weights) { + KALDI_ASSERT(range_length > 0); + int32 num_ranges = range_starts.size(); + weights->resize(num_ranges); + for (int32 i = 0; i < num_ranges; i++) { + (*weights)[i].Resize(range_length); + (*weights)[i].Set(1.0); + } + for (int32 i = 0; i + 1 < num_ranges; i++) { + int32 j = i + 1; + int32 i_start = range_starts[i], i_end = i_start + range_length, + j_start = range_starts[j]; + KALDI_ASSERT(j_start > i_start); + if (i_end > j_start) { + Vector &i_weights = (*weights)[i], &j_weights = (*weights)[j]; + + int32 overlap_length = i_end - j_start; + // divide the overlapping piece of the 2 ranges into 3 regions of + // approximately equal size, called the left, middle and right region. + int32 left_length = overlap_length / 3, + middle_length = (overlap_length - left_length) / 2, + right_length = overlap_length - left_length - middle_length; + KALDI_ASSERT(left_length >= 0 && middle_length >= 0 && right_length >= 0 && + left_length + middle_length + right_length == overlap_length); + // set the weight of the left region to be zero for the right (j) range. + for (int32 k = 0; k < left_length; k++) + j_weights(k) = 0.0; + // set the weight of the right region to be zero for the left (i) range. + for (int32 k = 0; k < right_length; k++) + i_weights(range_length - 1 - k) = 0.0; + // for the middle range, linearly interpolate between the 0's and 1's. + // note: we multiply with existing weights instead of set in order to get + // more accurate behavior in the unexpected case where things triply + // overlap. 
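+      // For example, if overlap_length = 7 then left_length = 2,
+      // middle_length = 2 and right_length = 3: the left 2 overlapped frames
+      // get weight 0 in the right range, the right 3 get weight 0 in the left
+      // range, and the middle 2 get weights 0.25 and 0.75, so each overlapped
+      // frame still receives a total weight of 1.0 across the two ranges.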
+ for (int32 k = 0; k < middle_length; k++) { + BaseFloat weight = (0.5 + k) / middle_length; + j_weights(left_length + k) = weight; + i_weights(range_length - 1 - right_length - k) = weight; + } + } + } +} + + +void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights) { + KALDI_ASSERT(range_length > 0 && num_frames_zeroed * 2 < range_length); + int32 num_ranges = range_starts.size(); + weights->resize(num_ranges); + for (int32 i = 0; i < num_ranges; i++) { + (*weights)[i].Resize(range_length); + (*weights)[i].Set(1.0); + } + if (num_frames_zeroed == 0) + return; + for (int32 i = 1; i < num_ranges; i++) + (*weights)[i].Range(0, num_frames_zeroed).Set(0.0); + for (int32 i = 0; i + 1 < num_ranges; i++) + (*weights)[i].Range(range_length - num_frames_zeroed, + num_frames_zeroed).Set(0.0); +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h new file mode 100644 index 00000000000..2dda8baf1e4 --- /dev/null +++ b/src/chain/chain-supervision.h @@ -0,0 +1,434 @@ +// chain/chain-supervision.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_SUPERVISION_H_ +#define KALDI_CHAIN_CHAIN_SUPERVISION_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "lat/kaldi-lattice.h" +#include "fstext/deterministic-fst.h" +#include "hmm/transition-model.h" + +namespace kaldi { +namespace chain { + +/* + This file contains some declarations relating to the object we use to + encode the supervision information for the 'chain' model. + + If we were training the model on whole utterances we could just use the + reference phone sequence, but to make it easier to train on parts of + utterances (and also for efficiency) we use the time-alignment information, + extended by a user-specified margin, to limit the range of frames + that the phones can appear at. +*/ + + +struct SupervisionOptions { + int32 left_tolerance; + int32 right_tolerance; + int32 frame_subsampling_factor; + + SupervisionOptions(): left_tolerance(5), + right_tolerance(5), + frame_subsampling_factor(1) { } + + void Register(OptionsItf *opts) { + opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " + "shift in phone position relative to the alignment"); + opts->Register("right-tolerance", &right_tolerance, "Right tolerance for " + "shift in phone position relative to the alignment"); + opts->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " + "if the frame-rate for the chain model will be less than the " + "frame-rate of the original alignment. 
Applied after " + "left-tolerance and right-tolerance are applied (so they are " + "in terms of the original num-frames."); + } + void Check() const; +}; + + +// This is the form that the supervision information for 'chain' models takes +// we compile it to Supervision. +// The normal compilation sequence is: +// (AlignmentToProtoSupervision or PhoneLatticeToProtoSupervision) +// Then you would call ProtoSupervisionToSupervision. + +struct ProtoSupervision { + // a list of (sorted, unique) lists of phones that are allowed + // on each frame. number of frames is allowed_phones.size(), which + // will equal the path length in 'fst'. + std::vector > allowed_phones; + + // The FST of phones; an epsilon-free acceptor. + fst::StdVectorFst fst; + + bool operator == (const ProtoSupervision &other) const; + + // We have a Write but no Read function; this Write function is + // only needed for debugging. + void Write(std::ostream &os, bool binary) const; +}; + +/** Creates a ProtoSupervision from a vector of phones and their durations, + such as might be derived from a training-data alignment (see the function + SplitToPhones()). Note: this probably isn't the normal way you'll do it, + it might be better to start with a phone-aligned lattice so you can capture + the alternative pronunciations; see PhoneLatticeToProtoSupervision(). + Returns true on success (the only possible failure is that total duration < + opts.subsampling_factor). */ +bool AlignmentToProtoSupervision(const SupervisionOptions &opts, + const std::vector &phones, + const std::vector &durations, + ProtoSupervision *proto_supervision); + +/** Creates a ProtoSupervision object from a vector of (phone, duration) pairs + (see the function SplitToPhones()). This does the same jobs as the other + AlignmentToProtoSupervision, from different input. + */ +bool AlignmentToProtoSupervision( + const SupervisionOptions &opts, + const std::vector > &phones_durs, + ProtoSupervision *proto_supervision); + + +/** Creates a proto-supervision from a phone-aligned phone lattice (i.e. a + lattice with phones as the labels, and with the transition-ids aligned with + the phones so you can compute the correct times. The normal path to + create such a lattice would be to generate a lattice containing multiple + pronunciations of the transcript by using steps/align_fmllr_lats.sh or a + similar script, followed by lattice-align-phones + --replace-output-symbols=true. + Returns true on success, and false on failure (the only failure modes are that + the number of frames in the lattice is less than opts.frame_subsampling_factor, + or there are epsilon phones in the lattice, or the final-probs have alignments + on them. +*/ +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const CompactLattice &clat, + ProtoSupervision *proto_supervision); + + +/** Modifies the duration information (start_time and end_time) of each phone + instance by the left_tolerance and right_tolerance (being careful not to go + over the edges of the utterance) and then applies frame-rate subsampling by + dividing each frame index in start_times and end_times , and num_frames, by + frame_subsampling_factor. Requires that proto_supervision->num_frames >= + options.frame_subsampling_factor. 
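+    For example, with left_tolerance = 5 and right_tolerance = 5, a phone
+    originally aligned to frames 10 .. 16 becomes allowed anywhere in frames
+    5 .. 21 (clipped to the edges of the utterance), before the division by
+    frame_subsampling_factor is applied.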
+ +*/ +void ModifyProtoSupervisionTimes(const SupervisionOptions &options, + ProtoSupervision *proto_supervision); + + + +/** + This class wraps the vector of allowed phones for each frame to create a + DeterministicOnDemandFst that we can compose with the decoding-graph FST to + limit the frames on which these phones are allowed to appear. This FST also + helps us convert the labels from transition-ids to (pdf-ids plus one), which + is what we'll be using in the forward-backward (it avoids the need to + keep the transition model around). + + Suppose the number of frames is T, then there will be T+1 states in + this FST, numbered from 0 to T+1, where state 0 is initial and state + T+1 is final. A transition is only allowed from state t to state t+1 + with a particular transition-id as its ilabel, if the corresponding + phone is listed in the 'allowed_phones' for that frame. The olabels + are pdf-ids plus one. + */ +class TimeEnforcerFst: + public fst::DeterministicOnDemandFst { + public: + typedef fst::StdArc::Weight Weight; + typedef fst::StdArc::StateId StateId; + typedef fst::StdArc::Label Label; + + TimeEnforcerFst(const TransitionModel &trans_model, + const std::vector > &allowed_phones): + trans_model_(trans_model), + allowed_phones_(allowed_phones) { } + + // We cannot use "const" because the pure virtual function in the interface is + // not const. + virtual StateId Start() { return 0; } + + virtual Weight Final(StateId s) { + return (s == allowed_phones_.size() ? Weight::One() : Weight::Zero()); + } + + // The ilabel is a transition-id; the state is interpreted as a frame-index. + // The olabel on oarc will be a pdf-id. The state-id is the time index 0 <= t + // <= num_frames. All transitions are to the next frame (but not all are + // allowed). The interface of GetArc requires ilabel to be nonzero (not + // epsilon). + virtual bool GetArc(StateId s, Label ilabel, fst::StdArc* oarc); + + private: + const TransitionModel &trans_model_; + const std::vector > &allowed_phones_; +}; + + +// struct Supervision is the fully-processed supervision information for +// a whole utterance or (after splitting) part of an utterance. It contains the +// time limits on phones encoded into the FST. +struct Supervision { + // The weight of this example (will usually be 1.0). + BaseFloat weight; + + // num_sequences will be 1 if you create a Supervision object from a single + // lattice or alignment, but if you combine multiple Supevision objects + // the 'num_sequences' is the number of objects that were combined (the + // FSTs get appended). + int32 num_sequences; + + // the number of frames in each sequence of appended objects. num_frames * + // num_sequences must equal the path length of any path in the FST. + // Technically this information is redundant with the FST, but it's convenient + // to have it separately. + int32 frames_per_sequence; + + // the maximum possible value of the labels in 'fst' (which go from 1 to + // label_dim). This should equal the NumPdfs() in the TransitionModel object. + // Included to avoid training on mismatched egs. + int32 label_dim; + + // This is an epsilon-free unweighted acceptor that is sorted in increasing + // order of frame index (this implies it's topologically sorted but it's a + // stronger condition). The labels are pdf-ids plus one (to avoid epsilons, + // since pdf-ids are zero-based). 
Each successful path in 'fst' has exactly + // 'frames_per_sequence * num_sequences' arcs on it (first 'frames_per_sequence' arcs for the + // first sequence; then 'frames_per_sequence' arcs for the second sequence, and so on). + fst::StdVectorFst fst; + + Supervision(): weight(1.0), num_sequences(1), frames_per_sequence(-1), + label_dim(-1) { } + + Supervision(const Supervision &other); + + void Swap(Supervision *other); + + bool operator == (const Supervision &other) const; + + // This function checks that this supervision object satifsies some + // of the properties we expect of it, and calls KALDI_ERR if not. + void Check(const TransitionModel &trans_model) const; + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); +}; + + +/** This function creates a Supervision object from a ProtoSupervision object. + The labels will be pdf-ids plus one. It sets supervision->label_dim + trans_model.NumPdfs(). + + It returns true on success, and false on failure; the only failure mode is + that it might return false on that would not be a bug, is when the FST is + empty because there were too many phones for the number of frames. +*/ +bool ProtoSupervisionToSupervision( + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + const ProtoSupervision &proto_supervision, + Supervision *supervision); + + +/** + This function sorts the states of the fst argument in an ordering + corresponding with a breadth-first search order starting from the + start state. This gives us the sorting on frame index for the + FSTs that appear in class Supervision (it relies on them being + epsilon-free). + This function requires that the input FST be connected (i.e. all states + reachable from the start state). + This function is called from ProtoSupervisionToSupervision(). +*/ +void SortBreadthFirstSearch(fst::StdVectorFst *fst); + +// This class is used for splitting something of type Supervision into +// multiple pieces corresponding to different frame-ranges. +class SupervisionSplitter { + public: + SupervisionSplitter(const Supervision &supervision); + + // Extracts a frame range of the supervision into 'supervision'. Note: the + // supervision object should not be used for training before you do + // 'AddWeightToSupervisionFst', which not only adds the weights from the + // normalization graph (derived from the normalization FST), but also removes + // epsilons and ensures the states are sorted on time. + void GetFrameRange(int32 begin_frame, int32 frames_per_sequence, + Supervision *supervision) const; + private: + // Creates an output FST covering frames begin_frame <= t < end_frame, + // assuming that the corresponding state-range that we need to + // include, begin_state <= s < end_state has been included. + // (note: the output FST will also have two special initial and final + // states). Does not do the post-processing (RmEpsilon, Determinize, + // TopSort on the result). See code for details. + void CreateRangeFst(int32 begin_frame, int32 end_frame, + int32 begin_state, int32 end_state, + fst::StdVectorFst *fst) const; + + const Supervision &supervision_; + // Indexed by the state-index of 'supervision_.fst', this is the frame-index, + // which ranges from 0 to (supervision_.frames_per_sequence * + // supervision_.num_sequences) - 1. This will be monotonically increasing + // (note that supervision_.fst is topologically sorted). 
+ std::vector frame_; +}; + + +/// This function adds weights to the FST in the supervision object, by +/// composing with the 'normalization fst'. It should be called directly after +/// GetFrameRange(). The 'normalization fst' is produced by the function +/// DenominatorGraph::GetNormalizationFst(); it's a slight modification of the +/// 'denominator fst'. This function modifies the weights in the supervision +/// object- adding to each path, the weight that it gets in the normalization +/// fst, which is the same weight that it will get in the denominator +/// forward-backward computation. This ensures that the (log) objective +/// function can never be positive (as the numerator graph will be a strict +/// subset of the denominator, with the same weights for the same paths). This +/// function returns true on success, and false on the (hopefully) rare occasion +/// that the composition of the normalization fst with the supervision produced +/// an empty result (this shouldn't happen unless there were alignment errors in +/// the alignments used to train the phone language model leading to unseen +/// 3-grams that occur in the training sequences). +/// This function also removes epsilons and makes sure supervision->fst has the +/// required sorting of states. Think of it as the final stage in preparation +/// of the supervision FST. +bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, + Supervision *supervision); + +/// Assuming the 'fst' is epsilon-free, connected, and has the property that all +/// paths from the start-state are of the same length, output a vector +/// containing that length (from the start-state to the current state) to +/// 'state_times'. The member 'fst' of struct Supervision has this property. +/// Returns the total number of frames. This function is similar to +/// LatticeStateTimes() and CompactLatticeStateTimes() declared in +/// lat/lattice-functions.h, except that unlike LatticeStateTimes(), we don't +/// allow epsilons-- not because they are hard to handle but because in this +/// context we don't expect them. This function also expects that the input fst +/// will have the property that the state times are in nondecreasing order (as +/// SortBreadthFirstSearch() will accomplish for FSTs satsifying the other +/// properties we mentioned). This just happens to be something we enforce +/// while creating these FSTs. +/// +/// @param fst[in] The input fst: should be epsilon-free; connected; nonempty; +/// should have the property that all paths to a given state (or +/// to a nonzero final-prob) should have the same number of arcs; +/// and its states should be sorted on this path length (e.g. +/// SortBreadthFirst will do this). +/// @param state_times[out] The state times that we output; will be set to +/// a vector with the dimension fst.NumStates(). +/// +/// @return Returns the path length +int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, + std::vector *state_times); + + +/// This function appends a list of supervision objects to create what will +/// usually be a single such object, but if the weights and num-frames are not +/// all the same it will only append Supervision objects where successive ones +/// have the same weight and num-frames, and if 'compactify' is true. The +/// normal use-case for this is when you are combining neural-net examples for +/// training; appending them like this helps to simplify the training process. 
+ +/// This function will crash if the values of label_dim in the inputs are not +/// all the same. +void AppendSupervision(const std::vector &input, + bool compactify, + std::vector *output_supervision); + + +/// This function helps you to pseudo-randomly split a sequence of length 'num_frames', +/// interpreted as frames 0 ... num_frames - 1, into pieces of length exactly +/// 'frames_per_range', to be used as examples for training. Because frames_per_range +/// may not exactly divide 'num_frames', this function will leave either small gaps or +/// small overlaps in pseudo-random places. +/// The output 'range_starts' will be set to a list of the starts of ranges, the +/// output ranges are of the form +/// [ (*range_starts)[i] ... (*range_starts)[i] + frames_per_range - 1 ]. +void SplitIntoRanges(int32 num_frames, + int32 frames_per_range, + std::vector *range_starts); + + +/// This utility function is not used directly in the 'chain' code. It is used +/// to get weights for the derivatives, so that we don't doubly train on some +/// frames after splitting them up into overlapping ranges of frames. The input +/// 'range_starts' will be obtained from 'SplitIntoRanges', but the +/// 'range_length', which is a length in frames, may be longer than the one +/// supplied to SplitIntoRanges, due the 'overlap'. (see the calling code... +/// if we want overlapping ranges, we get it by 'faking' the input to +/// SplitIntoRanges). +/// +/// The output vector 'weights' will be given the same dimension as +/// 'range_starts'. By default the output weights in '*weights' will be vectors +/// of all ones, of length equal to 'range_length', and '(*weights)[i]' represents +/// the weights given to frames numbered +/// t = range_starts[i] ... range_starts[i] + range_length - 1. +/// If these ranges for two successive 'i' values overlap, then we +/// reduce the weights to ensure that no 't' value gets a total weight +/// greater than 1. We do this by dividing the overlapped region +/// into three approximately equal parts, and giving the left part +/// to the left range; the right part to the right range; and +/// in between, interpolating linearly. +void GetWeightsForRanges(int32 range_length, + const std::vector &range_starts, + std::vector > *weights); + + +/// This is a newer version of GetWeightsForRanges with a simpler behavior +/// than GetWeightsForRanges and a different purpose. Instead of aiming to +/// create weights that sum to one over the whole file, the purpose is to +/// zero out the derivative weights for a certain number of frames to each +/// side of every 'cut point' in the numerator lattice [by numerator lattice, +/// what I mean is the FST that we automatically generate from the numerator +/// alignment or lattice]. So we don't zero out the weights for the very +/// beginning or very end of each original utterance, just those where +/// we split the utterance into pieces. We believe there is an incentive +/// for the network to produce deletions near the edges, and this aims to fix +/// this problem. +/// range_length is the length of each range of times (so range_starts[0] +/// represents the start of a range of t values of length 'range_length' +/// and so range_starts[1] etc.), and num_frames_zeroed is the number of frames +/// on each side of the cut point on which we are supposed to zero out the +/// derivative. 
+void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights); + + +typedef TableWriter > SupervisionWriter; +typedef SequentialTableReader > SequentialSupervisionReader; +typedef RandomAccessTableReader > RandomAccessSupervisionReader; + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_SUPERVISION_H_ diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc new file mode 100644 index 00000000000..1bf0201fbfa --- /dev/null +++ b/src/chain/chain-training.cc @@ -0,0 +1,115 @@ +// chain/chain-training.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/chain-training.h" +#include "chain/chain-kernels-ansi.h" +#include "chain/chain-numerator.h" +#include "chain/chain-denominator.h" + +namespace kaldi { +namespace chain { + +void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { + BaseFloat num_logprob_weighted; + if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + { + NumeratorComputation numerator(supervision, nnet_output); + // note: supervision.weight is included as a factor in the derivative from + // the numerator object, and the logprob too. + num_logprob_weighted = numerator.Forward(); + if (nnet_output_deriv) { + numerator.Backward(nnet_output_deriv); + if (xent_output_deriv) + xent_output_deriv->CopyFromMat(*nnet_output_deriv); + } else if (xent_output_deriv) { + // this branch will be taken if xent_output_deriv but not + // nnet_output_deriv is set- which could happen if you want to compute the + // cross-entropy objective but not the derivatives. + xent_output_deriv->SetZero(); + numerator.Backward(xent_output_deriv); + } + } + DenominatorComputation denominator(opts, den_graph, + supervision.num_sequences, + nnet_output); + + BaseFloat den_logprob = denominator.Forward(); + bool ok = true; + if (nnet_output_deriv) + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); + + *objf = num_logprob_weighted - supervision.weight * den_logprob; + *weight = supervision.weight * supervision.num_sequences * + supervision.frames_per_sequence; + if (!((*objf) - (*objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. 
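+    // (The check '(*objf) - (*objf) == 0' is false exactly when *objf is
+    // +/-inf or NaN: for any finite value the difference is 0.0, while
+    // inf - inf and NaN - NaN are both NaN, which compares unequal to 0.)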
+ if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); + BaseFloat default_objf = -10; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; + } + + // This code helps us see how big the derivatives are, on average, + // for different frames of the sequences. As expected, they are + // smaller towards the edges of the sequences (due to the penalization + // of 'incorrect' pdf-ids. + if (GetVerboseLevel() >= 1) { + int32 tot_frames = nnet_output_deriv->NumRows(), + frames_per_sequence = supervision.frames_per_sequence, + num_sequences = supervision.num_sequences; + CuVector row_products(tot_frames); + row_products.AddDiagMat2(1.0, *nnet_output_deriv, kNoTrans, 0.0); + Vector row_products_cpu(row_products); + Vector row_products_per_frame(frames_per_sequence); + for (int32 i = 0; i < tot_frames; i++) + row_products_per_frame(i / num_sequences) += row_products_cpu(i); + KALDI_LOG << "Derivs per frame are " << row_products_per_frame; + } + + if (opts.l2_regularize == 0.0) { + *l2_term = 0.0; + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + } +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h new file mode 100644 index 00000000000..e6143d10846 --- /dev/null +++ b/src/chain/chain-training.h @@ -0,0 +1,131 @@ +// chain/chain-training.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_TRAINING_H_ +#define KALDI_CHAIN_CHAIN_TRAINING_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "hmm/transition-model.h" +#include "chain/chain-den-graph.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace chain { + + +struct ChainTrainingOptions { + // l2 regularization constant on the 'chain' output; the actual term added to + // the objf will be -0.5 times this constant times the squared l2 norm. + // (squared so it's additive across the dimensions). e.g. try 0.0005. + BaseFloat l2_regularize; + + // Coefficient for 'leaky hmm'. 
This means we have an epsilon-transition from + // each state to a special state with probability one, and then another + // epsilon-transition from that special state to each state, with probability + // leaky_hmm_coefficient times [initial-prob of destination state]. Imagine + // we make two copies of each state prior to doing this, version A and version + // B, with transition from A to B, so we don't have to consider epsilon loops- + // or just imagine the coefficient is small enough that we can ignore the + // epsilon loops. + BaseFloat leaky_hmm_coefficient; + + + // Cross-entropy regularization constant. (e.g. try 0.1). If nonzero, + // the network is expected to have an output named 'output-xent', which + // should have a softmax as its final nonlinearity. + BaseFloat xent_regularize; + + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), + xent_regularize(0.0) { } + + void Register(OptionsItf *opts) { + opts->Register("l2-regularize", &l2_regularize, "l2 regularization " + "constant for 'chain' training, applied to the output " + "of the neural net."); + opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient " + "that allows transitions from each HMM state to each other " + "HMM state, to ensure gradual forgetting of context (can " + "improve generalization). For numerical reasons, may not be " + "exactly zero."); + opts->Register("xent-regularize", &xent_regularize, "Cross-entropy " + "regularization constant for 'chain' training. If " + "nonzero, the network is expected to have an output " + "named 'output-xent', which should have a softmax as " + "its final nonlinearity."); + } +}; + + +/** + This function does both the numerator and denominator parts of the 'chain' + computation in one call. + + @param [in] opts Struct containing options + @param [in] den_graph The denominator graph, derived from denominator fst. + @param [in] supervision The supervision object, containing the supervision + paths and constraints on the alignment as an FST + @param [in] nnet_output The output of the neural net; dimension must equal + ((supervision.num_sequences * supervision.frames_per_sequence) by + den_graph.NumPdfs()). The rows are ordered as: all sequences + for frame 0; all sequences for frame 1; etc. + @param [out] objf The [num - den] objective function computed for this + example; you'll want to divide it by 'tot_weight' before + displaying it. + @param [out] l2_term The l2 regularization term in the objective function, if + the --l2-regularize option is used. To be added to 'o + @param [out] weight The weight to normalize the objective function by; + equals supervision.weight * supervision.num_sequences * + supervision.frames_per_sequence. + @param [out] nnet_output_deriv The derivative of the objective function w.r.t. + the neural-net output. Only written to if non-NULL. + You don't have to zero this before passing to this function, + we zero it internally. + @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative + (which equals a posterior from the numerator forward-backward, + scaled by the supervision weight) is written to here. This will + be used in the cross-entropy regularization code. This value + is also used in computing the cross-entropy objective value. 
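+
+      A typical call from training code might look like the following sketch
+      (the surrounding variable names are illustrative only, and the deriv
+      matrices are assumed to already have the right dimensions):
+
+        BaseFloat objf, l2_term, weight;
+        ComputeChainObjfAndDeriv(opts, den_graph, supervision, nnet_output,
+                                 &objf, &l2_term, &weight,
+                                 &nnet_output_deriv, NULL);
+        BaseFloat objf_per_frame = (objf + l2_term) / weight;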
+*/ +void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); + + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_TRAINING_H_ + diff --git a/src/chain/context-dep-topology.h b/src/chain/context-dep-topology.h new file mode 100644 index 00000000000..5eae267a5cf --- /dev/null +++ b/src/chain/context-dep-topology.h @@ -0,0 +1,129 @@ +// chain/context-dep-topology.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ +#define KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "chain/phone-topology.h" +#include "chain/phone-context.h" + +namespace kaldi { +namespace chain { + + +/** + The 'ContextDepTopology' object is responsible for combining the + 'PhoneTopology' model, which describes the quasi-HMM topology for each phone, + and the 'PhoneContext' model, which describes how we create left-context + dependent phones. It also allocates 'graph-labels' and 'output-labels'. It + is analogous to 'HC' in the 'HCLG' recipe. It's of a manageable size as an + FST, because we limit ourselves to left context. + + A 'graph-label' is one-based, is sufficient to identify the logical CD-phone + and the label in the topology, and can also be mapped to an 'output-label'. + + The output-label is also one-based; it is sufficient to identify the physical + CD-phone and the label in the topology object, but won't let you identify + the monophone (because output-labels may be shared between monophones). + + The neural-net output is indexed by the output-label minus one (to form + a zero-based index). +*/ + +class ContextDepTopology { + public: + + ContextDepTopology(); + + ContextDepTopology(const PhoneTopology &topology, + const PhoneContext &context); + + const PhoneTopology &GetPhoneTopology() { return phone_topology_; } + + const PhoneContext &GetPhoneContext() { return phone_context_; } + + // Returns the number of output-labels (labels corresponding to the neural-net + // output). The actual neural-net output matrix is indexed by the label minus + // one, which we call an output-index. + int32 NumOutputLabels(); + + // Returns the number of graph-labels. A graph-label is what will typically + // appear in HCLG decoding graphs; it is mappable to an output-label, but we + // also ensure that it is mappable to a phone. + int32 NumGraphLabels(); + + // convenience function to return the number of phones. 
+ int32 NumPhones() { return phone_topology_.NumPhones(); } + + // maps a graph-label to an output-label. + int32 GraphLabelToOutputLabel(int32 graph_label); + + // maps a graph label to a phone. + int32 GraphLabelToPhone(int32 graph_label); + + // maps a graph label to a logical cd-phone [a logical cd-phone is always + // mappable to the monophone]. + int32 GraphLabelToLogicalCdPhone(int32 graph_label); + + // maps a graph label to a physical cd-phone, as defined by the PhoneContext + // object. + int32 GraphLabelToPhysicalCdPhone(int32 graph_label); + + // maps a graph label to a label in the phone's topology object (needed to + // work out phone alignments). + int32 GraphLabelToTopologyLabel(int32 graph_label); + + // Outputs to 'output' an FST that represents this object-- it's essentially + // the 'HC' object in the 'HCLG' recipe. It's an unweighted transducer where + // the input labels are phones (or epsilon) and the output labels are + // 'graph-labels'. Note: we will ensure that there are no epsilons on + // the 'output side'. + void GetAsFst(fst::VectorFst* output) const; + + // This variant of of GetAsFst gives you 'output-labels' as the olabels, instead + // of graph-labels. These are indexes-into-the-nnet-output plus one. + void GetAsFstWithOutputLabels(fst::VectorFst* output) const; + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &is, bool binary); + + private: + PhoneTopology phone_topology_; + PhoneContext phone_context_; + + struct GraphLabelInfo { + int32 logical_cd_phone; + int32 topology_label; + int32 output_label; + }; +}; + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ diff --git a/src/chain/language-model-test.cc b/src/chain/language-model-test.cc new file mode 100644 index 00000000000..04a57441ada --- /dev/null +++ b/src/chain/language-model-test.cc @@ -0,0 +1,112 @@ +// chain/language-model-test.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/language-model.h" +#include "fstext/fstext-utils.h" + +namespace kaldi { +namespace chain { + +static void GetTestingData(int32 *vocab_size, + std::vector > *data) { + // read the code of a C++ file as training data. + bool binary; + Input input("language-model.cc", &binary); + KALDI_ASSERT(!binary); + std::istream &is = input.Stream(); + std::string line; + *vocab_size = 127; + int32 line_count = 0; + for (; getline(is, line); line_count++) { + std::vector int_line(line.size()); + for (size_t i = 0; i < line.size(); i++) { + int32 this_char = line[i]; + if (this_char == 0) { + this_char = 1; // should never happen, but just make sure, as 0 is + // treated as BOS/EOS in the language modeling code. 
+ } + int_line[i] = std::min(127, this_char); + } + data->push_back(int_line); + } + KALDI_ASSERT(line_count > 0); +} + + +void ShowPerplexity(const fst::StdVectorFst &fst, + const std::vector > &data) { + int64 num_phones = 0; + double tot_loglike = 0; + for (size_t i = 0; i < data.size(); i++) { + num_phones += data[i].size(); + fst::StdVectorFst linear_fst; + MakeLinearAcceptor(data[i], &linear_fst); + fst::StdVectorFst composed_fst; + fst::Compose(linear_fst, fst, &composed_fst); + fst::TropicalWeight weight = fst::ShortestDistance(composed_fst); + KALDI_ASSERT(weight != fst::TropicalWeight::Zero()); + tot_loglike -= weight.Value(); + } + double perplexity = exp(-(tot_loglike / num_phones)); + KALDI_LOG << "Perplexity over " << num_phones + << " phones (of training data) is " << perplexity; +} + + +void LanguageModelTest() { + int32 vocab_size; + std::vector > data; + GetTestingData(&vocab_size, &data); + + LanguageModelOptions opts; + opts.no_prune_ngram_order = RandInt(1, 3); + opts.ngram_order = opts.no_prune_ngram_order + RandInt(0, 3); + opts.num_extra_lm_states = RandInt(1, 200); + if (opts.ngram_order < 2) + opts.ngram_order = 2; + if (RandInt(1, 2) == 1) + opts.num_extra_lm_states *= 10; + + LanguageModelEstimator estimator(opts); + for (size_t i = 0; i < data.size(); i++) { + std::vector &sentence = data[i]; + estimator.AddCounts(sentence); + } + + fst::StdVectorFst fst; + estimator.Estimate(&fst); + bool ans = IsStochasticFstInLog(fst); + KALDI_ASSERT(ans); // check that it normalizes. + KALDI_ASSERT(fst.Properties(fst::kAcceptor, true) == fst::kAcceptor); + KALDI_ASSERT(fst.Properties(fst::kIDeterministic, true) == fst::kIDeterministic); + KALDI_ASSERT(fst.Properties(fst::kIEpsilons, true) == 0); + + ShowPerplexity(fst, data); +} + + + +} // namespace chain +} // namespace kaldi + +int main() { + // kaldi::SetVerboseLevel(2); + for (int32 i = 0; i < 30; i++) + kaldi::chain::LanguageModelTest(); +} diff --git a/src/chain/language-model.cc b/src/chain/language-model.cc new file mode 100644 index 00000000000..f144d3d1bc1 --- /dev/null +++ b/src/chain/language-model.cc @@ -0,0 +1,411 @@ +// chain/language-model.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "chain/language-model.h" +#include "util/simple-io-funcs.h" + + +namespace kaldi { +namespace chain { + +void LanguageModelEstimator::AddCounts(const std::vector &sentence) { + KALDI_ASSERT(opts_.ngram_order >= 2 && "--ngram-order must be >= 2"); + KALDI_ASSERT(opts_.ngram_order >= opts_.no_prune_ngram_order); + int32 order = opts_.ngram_order; + // 0 is used for left-context at the beginning of the file.. treat it as BOS. 
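+  // For example, with ngram_order = 3 and a sentence { a, b, c }, the loop
+  // below increments the counts (0) -> a, (0, a) -> b, (a, b) -> c and
+  // finally (b, c) -> 0 for the end of sentence; the history window never
+  // grows beyond ngram_order - 1 = 2 symbols.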
+ std::vector history(1, 0); + std::vector::const_iterator iter = sentence.begin(), + end = sentence.end(); + for (; iter != end; ++iter) { + KALDI_ASSERT(*iter != 0); + IncrementCount(history, *iter); + history.push_back(*iter); + if (history.size() >= order) + history.erase(history.begin()); + } + // Probability of end of sentence. This will end up getting ignored later, but + // it still makes a difference for probability-normalization reasons. + IncrementCount(history, 0); +} + +void LanguageModelEstimator::IncrementCount(const std::vector &history, + int32 next_phone) { + int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history); + if (lm_states_[lm_state_index].tot_count == 0) { + num_active_lm_states_++; + } + lm_states_[lm_state_index].AddCount(next_phone, 1); +} + +void LanguageModelEstimator::SetParentCounts() { + int32 num_lm_states = lm_states_.size(); + for (int32 l = 0; l < num_lm_states; l++) { + int32 this_count = lm_states_[l].tot_count; + int32 l_iter = l; + while (l_iter != -1) { + lm_states_[l_iter].tot_count_with_parents += this_count; + l_iter = lm_states_[l_iter].backoff_lmstate_index; + } + } + for (int32 l = 0; l < num_lm_states; l++) { + KALDI_ASSERT(lm_states_[l].tot_count_with_parents >= + lm_states_[l].tot_count); + } +} + +int32 LanguageModelEstimator::CheckActiveStates() const { + int32 num_active_states = 0, + num_lm_states = lm_states_.size(), + num_basic_lm_states = 0; + for (int32 l = 0; l < num_lm_states; l++) { + if (lm_states_[l].tot_count != 0) + num_active_states++; + if (lm_states_[l].history.size() == opts_.no_prune_ngram_order - 1) + num_basic_lm_states++; + } + KALDI_ASSERT(num_active_states == num_active_lm_states_); + return num_basic_lm_states; +} + +int32 LanguageModelEstimator::FindLmStateIndexForHistory( + const std::vector &hist) const { + MapType::const_iterator iter = hist_to_lmstate_index_.find(hist); + if (iter == hist_to_lmstate_index_.end()) + return -1; + else + return iter->second; +} + +int32 LanguageModelEstimator::FindNonzeroLmStateIndexForHistory( + std::vector hist) const { + while (1) { + int32 l = FindLmStateIndexForHistory(hist); + if (l == -1 || lm_states_[l].tot_count == 0) { + // no such state or state has zero count. + if (hist.empty()) + KALDI_ERR << "Error looking up LM state index for history " + << "(likely code bug)"; + hist.erase(hist.begin()); // back off. + } else { + return l; + } + } +} + +int32 LanguageModelEstimator::FindOrCreateLmStateIndexForHistory( + const std::vector &hist) { + MapType::const_iterator iter = hist_to_lmstate_index_.find(hist); + if (iter != hist_to_lmstate_index_.end()) + return iter->second; + int32 ans = lm_states_.size(); // index of next element + // next statement relies on default construct of LmState. + lm_states_.resize(lm_states_.size() + 1); + lm_states_.back().history = hist; + hist_to_lmstate_index_[hist] = ans; + // make sure backoff_lmstate_index is set, if needed. + if (hist.size() >= opts_.no_prune_ngram_order) { + // we need a backoff state to exist- create one if needed. 
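+    // For example, if no_prune_ngram_order is 2 and hist is (a, b, c), the
+    // recursion below creates (or finds) LM states for (b, c) and then (c),
+    // chaining them via backoff_lmstate_index; it stops once the history is
+    // shorter than no_prune_ngram_order.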
+ std::vector backoff_hist(hist.begin() + 1, + hist.end()); + + int32 backoff_lm_state = FindOrCreateLmStateIndexForHistory( + backoff_hist); + lm_states_[ans].backoff_lmstate_index = backoff_lm_state; + hist_to_lmstate_index_[backoff_hist] = backoff_lm_state; + } + return ans; +} + +void LanguageModelEstimator::LmState::AddCount(int32 phone, int32 count) { + std::map::iterator iter = phone_to_count.find(phone); + if (iter == phone_to_count.end()) + phone_to_count[phone] = count; + else + iter->second += count; + tot_count += count; +} + +void LanguageModelEstimator::LmState::Add(const LmState &other) { + KALDI_ASSERT(&other != this); + std::map::const_iterator iter = other.phone_to_count.begin(), + end = other.phone_to_count.end(); + for (; iter != end; ++iter) + AddCount(iter->first, iter->second); +} + +void LanguageModelEstimator::LmState::Clear() { + phone_to_count.clear(); + tot_count = 0; + tot_count_with_parents = false; + backoff_allowed = false; +} + +BaseFloat LanguageModelEstimator::LmState::LogLike() const { + double ans = 0.0; + int32 tot_count_check = 0; + std::map::const_iterator iter = phone_to_count.begin(), + end = phone_to_count.end(); + for (; iter != end; ++iter) { + int32 count = iter->second; + tot_count_check += count; + double prob = count * 1.0 / tot_count; + ans += log(prob) * count; + } + KALDI_ASSERT(tot_count_check == tot_count); + return ans; +} + +void LanguageModelEstimator::InitializeQueue() { + int32 num_lm_states = lm_states_.size(); + while (!queue_.empty()) queue_.pop(); + for (int32 l = 0; l < num_lm_states; l++) { + lm_states_[l].backoff_allowed = BackoffAllowed(l); + if (lm_states_[l].backoff_allowed) { + BaseFloat like_change = BackoffLogLikelihoodChange(l); + queue_.push(std::pair(like_change, l)); + } + } +} + +BaseFloat LanguageModelEstimator::BackoffLogLikelihoodChange( + int32 l) const { + const LmState &lm_state = lm_states_.at(l); + KALDI_ASSERT(lm_state.backoff_allowed && lm_state.backoff_lmstate_index >= 0); + const LmState &backoff_lm_state = lm_states_.at( + lm_state.backoff_lmstate_index); + KALDI_ASSERT(lm_state.tot_count != 0); + // if the backoff state has zero count, there would naturally be a zero + // cost, but return -1e15 * (count of this lm state)... this encourages the + // lowest-count state to be backed off first. + if (backoff_lm_state.tot_count == 0) + return -1.0e-15 * lm_state.tot_count; + LmState sum_state(backoff_lm_state); + sum_state.Add(lm_state); + BaseFloat log_like_change = + sum_state.LogLike() - + lm_state.LogLike() - + backoff_lm_state.LogLike(); + // log-like change should not be positive... give it a margin for round-off + // error. + KALDI_ASSERT(log_like_change < 0.1); + if (log_like_change > 0.0) + log_like_change = 0.0; + return log_like_change; +} + + +void LanguageModelEstimator::DoBackoff() { + int32 initial_active_states = num_active_lm_states_, + target_num_lm_states = num_basic_lm_states_ + opts_.num_extra_lm_states; + + // create 3 intermediate targets and the final target. Between each phase we'll + // do InitializeQueue(), which will get us more exact values. 
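+  // For example (illustrative numbers): with 10000 initially active states and
+  // a final target of 4000, the schedule below gives intermediate targets of
+  // 8500, 7000 and 5500 before reaching 4000.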
+ int32 num_targets = 4; + std::vector targets(num_targets); + for (int32 t = 0; t < num_targets; t++) { + // the targets get progressively closer to target_num_lm_states; + targets[t] = initial_active_states + + ((target_num_lm_states - initial_active_states) * (t + 1)) / num_targets; + } + KALDI_ASSERT(targets.back() == target_num_lm_states); + + for (int32 t = 0; t < num_targets; t++) { + KALDI_VLOG(2) << "Backing off states, stage " << t; + InitializeQueue(); + int32 this_target = targets[t]; + while (num_active_lm_states_ > this_target && !queue_.empty()) { + BaseFloat like_change = queue_.top().first; + int32 lm_state = queue_.top().second; + queue_.pop(); + BaseFloat recomputed_like_change = BackoffLogLikelihoodChange(lm_state); + if (!ApproxEqual(like_change, recomputed_like_change)) { + // If it changed (i.e. we had a stale likelihood-change on the queue), + // just put back the recomputed like-change on the queue and make no other + // changes. + KALDI_VLOG(2) << "Not backing off state, since like-change changed from " + << like_change << " to " << recomputed_like_change; + queue_.push(std::pair(recomputed_like_change, lm_state)); + } else { + KALDI_VLOG(2) << "Backing off state with like-change = " + << recomputed_like_change; + BackOffState(lm_state); + } + } + } + KALDI_LOG << "In LM [hard] backoff, target num states was " + << num_basic_lm_states_ << " + --num-extra-lm-states=" + << opts_.num_extra_lm_states << " = " << target_num_lm_states + << ", pruned from " << initial_active_states << " to " + << num_active_lm_states_; +} + +void LanguageModelEstimator::BackOffState(int32 l) { + LmState &lm_state = lm_states_.at(l); + KALDI_ASSERT(lm_state.backoff_allowed); + KALDI_ASSERT(lm_state.backoff_lmstate_index >= 0); + KALDI_ASSERT(lm_state.tot_count > 0); // or shouldn't be backing it off. + LmState &backoff_lm_state = lm_states_.at(lm_state.backoff_lmstate_index); + bool backoff_state_had_backoff_allowed = backoff_lm_state.backoff_allowed; + if (backoff_lm_state.tot_count != 0) + num_active_lm_states_--; + // add the counts of lm_state to backoff_lm_state. + backoff_lm_state.Add(lm_state); + // zero the counts in this lm_state. + lm_state.Clear(); + backoff_lm_state.backoff_allowed = BackoffAllowed( + lm_state.backoff_lmstate_index); + + if (!backoff_state_had_backoff_allowed && + backoff_lm_state.backoff_allowed) { + // the backoff state would not have been in the queue, but is now allowed in + // the queue. 
+ BaseFloat backoff_like_change = BackoffLogLikelihoodChange( + lm_state.backoff_lmstate_index); + queue_.push(std::pair(backoff_like_change, + lm_state.backoff_lmstate_index)); + } +} + +int32 LanguageModelEstimator::AssignFstStates() { + CheckActiveStates(); + int32 num_lm_states = lm_states_.size(); + int32 current_fst_state = 0; + for (int32 l = 0; l < num_lm_states; l++) + if (lm_states_[l].tot_count != 0) + lm_states_[l].fst_state = current_fst_state++; + KALDI_ASSERT(current_fst_state == num_active_lm_states_); + return current_fst_state; +} + +void LanguageModelEstimator::Estimate(fst::StdVectorFst *fst) { + KALDI_LOG << "Estimating language model with --no-prune-ngram-order=" + << opts_.no_prune_ngram_order << ", --ngram-order=" + << opts_.ngram_order << ", --num-extra-lm-state=" + << opts_.num_extra_lm_states; + SetParentCounts(); + num_basic_lm_states_ = CheckActiveStates(); + DoBackoff(); + int32 num_fst_states = AssignFstStates(); + OutputToFst(num_fst_states, fst); +} + +int32 LanguageModelEstimator::FindInitialFstState() const { + std::vector history(1, 0); + int32 l = FindNonzeroLmStateIndexForHistory(history); + KALDI_ASSERT(l != -1 && lm_states_[l].fst_state != -1); + return lm_states_[l].fst_state; +} + + +bool LanguageModelEstimator::BackoffAllowed(int32 l) const { + const LmState &lm_state = lm_states_.at(l); + if (lm_state.history.size() < opts_.no_prune_ngram_order) + return false; + KALDI_ASSERT(lm_state.tot_count <= lm_state.tot_count_with_parents); + if (lm_state.tot_count != lm_state.tot_count_with_parents) + return false; + if (lm_state.tot_count == 0) + return false; + // the next if-statement is an optimization where we skip the + // following test if we know that it must always be true. + if (lm_state.history.size() == opts_.ngram_order - 1) + return true; + std::map::const_iterator + iter = lm_state.phone_to_count.begin(), + end = lm_state.phone_to_count.end(); + for (; iter != end; ++iter) { + int32 phone = iter->first; + if (phone != 0) { + std::vector next_hist(lm_state.history); + next_hist.push_back(phone); + int32 next_lmstate = FindLmStateIndexForHistory(next_hist); + if (next_lmstate != -1 && + lm_states_[next_lmstate].tot_count_with_parents != 0) { + // backoff is not allowed because we need all the context we have + // in order to make this transition; we can't afford to discard + // the leftmost phone. + return false; + } + } + } + return true; +} + +void LanguageModelEstimator::OutputToFst( + int32 num_states, + fst::StdVectorFst *fst) const { + KALDI_ASSERT(num_states == num_active_lm_states_); + fst->DeleteStates(); + for (int32 i = 0; i < num_states; i++) + fst->AddState(); + fst->SetStart(FindInitialFstState()); + + int64 tot_count = 0; + double tot_logprob = 0.0; + + int32 num_lm_states = lm_states_.size(); + // note: not all lm-states end up being 'active'. + for (int32 l = 0; l < num_lm_states; l++) { + const LmState &lm_state = lm_states_[l]; + if (lm_state.fst_state == -1) + continue; + int32 state_count = lm_state.tot_count; + KALDI_ASSERT(state_count != 0); + std::map::const_iterator + iter = lm_state.phone_to_count.begin(), + end = lm_state.phone_to_count.end(); + for (; iter != end; ++iter) { + int32 phone = iter->first, count = iter->second; + BaseFloat logprob = log(count * 1.0 / state_count); + tot_count += count; + tot_logprob += logprob * count; + if (phone == 0) { // Interpret as final-prob. + fst->SetFinal(lm_state.fst_state, fst::TropicalWeight(-logprob)); + } else { // It becomes a transition. 
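+        // The arc is an acceptor arc: ilabel == olabel == phone, the weight is
+        // -log(count / state_count), and the destination is the FST state of
+        // the most specific surviving history for (this history + phone).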
+ std::vector next_history(lm_state.history); + next_history.push_back(phone); + int32 dest_lm_state = FindNonzeroLmStateIndexForHistory(next_history), + dest_fst_state = lm_states_[dest_lm_state].fst_state; + KALDI_ASSERT(dest_fst_state != -1); + fst->AddArc(lm_state.fst_state, + fst::StdArc(phone, phone, fst::TropicalWeight(-logprob), + dest_fst_state)); + } + } + } + BaseFloat perplexity = exp(-(tot_logprob / tot_count)); + KALDI_LOG << "Total number of phone instances seen was " << tot_count; + KALDI_LOG << "Perplexity on training data is: " << perplexity; + KALDI_LOG << "Note: perplexity on unseen data will be infinity as there is " + << "no smoothing. This is by design, to reduce the number of arcs."; + fst::Connect(fst); + // Make sure that Connect does not delete any states. + int32 num_states_connected = fst->NumStates(); + KALDI_ASSERT(num_states_connected == num_states); + // arc-sort. ilabel or olabel doesn't matter, it's an acceptor. + fst::ArcSort(fst, fst::ILabelCompare()); + KALDI_LOG << "Created phone language model with " << num_states << " states."; +} + +} // namespace chain +} // namespace kaldi + + diff --git a/src/chain/language-model.h b/src/chain/language-model.h new file mode 100644 index 00000000000..b2c3f4cd746 --- /dev/null +++ b/src/chain/language-model.h @@ -0,0 +1,269 @@ +// chain/language-model.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_LANGUAGE_MODEL_H_ +#define KALDI_CHAIN_LANGUAGE_MODEL_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" + +namespace kaldi { + + +namespace chain { + +// Options for phone language model estimation. This is similar to an +// un-smoothed language model of a certain order (e.g. triphone). We won't be +// actually decoding with this, we'll just use it as the 'denominator graph' in +// acoustic model estimation. The reason for avoiding smoothing is to reduce +// the number of transitions in the language model, which will improve +// efficiency of training. + +struct LanguageModelOptions { + int32 ngram_order; // you might want to tune this + int32 num_extra_lm_states; // you also might want to tune this + int32 no_prune_ngram_order; // e.g. set this to 3 and it won't prune the + // trigram contexts (note: a trigram + // history-state has 2 known left phones)... this + // tends to make for a more compact graph (since + // the context FST anyway expands to trigram). 
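+  // For example (illustrative): with no_prune_ngram_order = 3 and
+  // ngram_order = 4, all trigram history-states (2 known left phones) are
+  // kept, while 4-gram history-states are candidates for hard backoff, of
+  // which roughly num_extra_lm_states will survive pruning.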
+ + LanguageModelOptions(): + ngram_order(4), + num_extra_lm_states(1000), + no_prune_ngram_order(3) { } + + void Register(OptionsItf *opts) { + opts->Register("ngram-order", &ngram_order, "n-gram order for the phone " + "language model used for the 'denominator model'"); + opts->Register("num-extra-lm-states", &num_extra_lm_states, "Number of LM " + "states desired on top of the nubmer determined by the " + "--no-prune-ngram-order option."); + opts->Register("no-prune-ngram-order", &no_prune_ngram_order, "n-gram order " + "below which the language model is not pruned (should " + "probably be set the same as your --context-width for phone " + "context in tree building, to make the graph as compact as " + "possible)"); + } +}; + +/** + This LanguageModelEstimator class estimates an n-gram language model + with a kind of 'hard' backoff that is intended to reduce the number of + arcs in the final compiled FST. Basically, we never back off to the lower-order + n-gram state, but we sometimes do just say, "this state's count is too small + so we won't have this state at all", and this LM state disappears and + transitions to it go to the lower-order n-gram's state. + + This language model is implemented as a set of states, and transitions + between these states; there is no concept of a backoff transition here. + Because this maps very naturally to an FST, we output it as an FST. + */ +class LanguageModelEstimator { + public: + LanguageModelEstimator(LanguageModelOptions &opts): opts_(opts), + num_active_lm_states_(0) { + KALDI_ASSERT(opts.ngram_order >= 1 && opts.no_prune_ngram_order >= 1); + } + + // Adds counts for this sentence. Basically does: for each n-gram in the + // sentence, count[n-gram] += 1. The only constraint on 'sentence' is that it + // should contain no zeros. + void AddCounts(const std::vector &sentence); + + // Estimates the LM and outputs it as an FST. Note: there is + // no concept here of backoff arcs. + void Estimate(fst::StdVectorFst *fst); + + protected: + struct LmState { + // the phone history associated with this state (length can vary). + std::vector history; + // maps from + std::map phone_to_count; + // total count of this state. As we back off states to lower-order states + // (and note that this is a hard backoff where we completely remove un-needed + // states) this tot_count may become zero. + int32 tot_count; + + // total count of this state plus all states that back off to this state. + // only valid after SetParentCounts() is called. + int32 tot_count_with_parents; + + // LM-state index of the backoff LM state (if it exists, else -1)... + // provided for convenience. The backoff state exist if and only + // if history.size() >= no_prune_ngram_order + int32 backoff_lmstate_index; + + // keeps track of the number of other LmStates 'other' for whom + // (other.tot_count > 0 or other.num_parents > 0) and + // other.backoff_lmstate_index is the index of this LM state. + // This lets us know whether this state has a chance, in the future, + // of getting a nonzero count, which in turn is used in the + // BackoffAllowed() function. + int32 num_parents; + + // this is only set after we decide on the FST state numbering (at the end). + // If not set, it's -1. + int32 fst_state; + + // True if backoff of this state is allowed (which implies it's in the queue). + // Backoff of this state is allowed (i.e. 
we will consider removing this state) + // if its history length is >= opts.no_prune_ngram_order, and it has nonzero + // count, and + bool backoff_allowed; + + void AddCount(int32 phone, int32 count); + + // Log-likelihood of data in this case, summed, not averaged: + // i.e. sum(phone in phones) count(phone) * log-prob(phone | this state). + BaseFloat LogLike() const; + // Add the contents of another LmState. + void Add(const LmState &other); + // Clear all counts from this state. + void Clear(); + LmState(): tot_count(0), tot_count_with_parents(0), backoff_lmstate_index(-1), + fst_state(-1), backoff_allowed(false) { } + LmState(const LmState &other): + history(other.history), phone_to_count(other.phone_to_count), + tot_count(other.tot_count), tot_count_with_parents(other.tot_count_with_parents), + backoff_lmstate_index(other.backoff_lmstate_index), + fst_state(other.fst_state), backoff_allowed(other.backoff_allowed) { } + }; + + // maps from history to int32 + typedef unordered_map, int32, VectorHasher > MapType; + + LanguageModelOptions opts_; + + MapType hist_to_lmstate_index_; + std::vector lm_states_; // indexed by lmstate_index, the LmStates. + + // Keeps track of the number of lm states that have nonzero counts. + int32 num_active_lm_states_; + + // The number of LM states that we would have due to the + // no_prune_ngram_order_. Equals the number of history-states of length + // no_prune_ngram_order_ - 1. Used to compute the total number of desired + // state (by adding opts_.num_extra_lm_states). + int32 num_basic_lm_states_; + + // Queue of pairs: (likelihood change [which is negative], lm_state_index). + // We always pick the one with the highest (least negative) likelihood change + // to merge. Note: elements in the queue can get out of date, so it's + // necessary to check that something is up-to-date (i.e. the likelihood change + // is accurate) before backing off a state. + // Note: after InitializeQueue() is called, any state that has nonzero count + // and history-length >= no_prune_ngram_order, will be in the queue. + // + // This whole algorithm is slightly approximate (i.e. it may not always back + // off the absolutely lowest-cost states), because we don't force + // recomputation of all the costs each time we back something off. Generally + // speaking, these costs will only increase as we back off more states, so the + // approximation is not such a big deal. + std::priority_queue > queue_; + + + // adds the counts for this ngram (called from AddCounts()). + inline void IncrementCount(const std::vector &history, + int32 next_phone); + + + // Computes whether backoff should be allowed for this lm_state. (the caller + // can set the backoff_allowed variable to match). Backoff is allowed if the + // history length is >= opts_.no_prune_ngram_order, and tot_count == + // tot_count_with_parents (i.e. there are no parents that are not yet backed + // off), and the total count is nonzero, and all transitions from this state + // involve backoff. (i.e. backoff is disallowed if the the history-state + // (this history-state + next-phone) exists. + bool BackoffAllowed(int32 lm_state) const; + + // sets up tot_count_with_parents in all the lm-states + void SetParentCounts(); + + // Computes the change, in log-likelihood caused by backing off this lm state + // to its backoff state, i.e. combining its counts with those of its backoff + // state. This lm state must have backoff_allowed set to true. This function + // returns what can be interpreted as a negated cost. 
As a special case, if + // the backoff state has a zero count but this state has a nonzero count, we + // set the like-change to -1e-15 * (count of this state). Before the backoff + // states have any counts, this encourages the lowest-count states to get + // backed-off first. + BaseFloat BackoffLogLikelihoodChange(int32 lmstate_index) const; + + // Adds to the queue, all LmStates that have nonzero count and history-length is + // >= no_prune_ngram_order. + void InitializeQueue(); + + // does the logic of pruning/backing-off states. + void DoBackoff(); + + // This function, will back off the counts of this lm_state to its + // backoff state, and update num_active_lm_states_ as appropriate. + // If the count of the backoff state was previously zero, and the backoff + // state's history-length is >= no_prune_ngram_order, the backoff + // state will get added to the queue. + void BackOffState(int32 lm_state); + + // Check, that num_active_lm_states_ is accurate, and returns + // the number of 'basic' LM-states (i.e. the number of lm-states whose history + // is of length no_prune_ngram_order - 1). + int32 CheckActiveStates() const; + + // Finds and returns an LM-state index for a history -- or -1 if it doesn't + // exist. No backoff is done. + int32 FindLmStateIndexForHistory(const std::vector &hist) const; + + // Finds and returns an LM-state index for a history -- and creates one if + // it doesn't exist -- and also creates any backoff states needed, down + // to history-length no_prune_ngram_order - 1. + int32 FindOrCreateLmStateIndexForHistory(const std::vector &hist); + + // Finds and returns the most specific LM-state index for a history or + // backed-off versions of it, that exists and has nonzero count. Will die if + // there is no such history. [e.g. if there is no unigram backoff state, + // which generally speaking there won't be.] + int32 FindNonzeroLmStateIndexForHistory(std::vector hist) const; + + // after all backoff has been done, assigns FST state indexes to all states + // that exist and have nonzero count. Returns the number of states. + int32 AssignFstStates(); + + // find the FST index of the initial-state, and returns it. + int32 FindInitialFstState() const; + + void OutputToFst( + int32 num_fst_states, + fst::StdVectorFst *fst) const; + +}; + + + +} // namespace chain +} // namespace kaldi + +#endif + diff --git a/src/chain/phone-context.h b/src/chain/phone-context.h new file mode 100644 index 00000000000..bfcb56e64d1 --- /dev/null +++ b/src/chain/phone-context.h @@ -0,0 +1,188 @@ +// chain/phone-context.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#ifndef KALDI_CHAIN_PHONE_CONTEXT_H_ +#define KALDI_CHAIN_PHONE_CONTEXT_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" + +namespace kaldi { +namespace chain { + + +/** + The 'PhoneContext' object is responsible for mapping phones in left-context to + cd-phones (context-dependent phones). In the 'chain' models, we only support + left-context, in order to make phone-level discriminative training + sufficiently efficient. The 'PhoneContext' model represents all the + information we need to know about the phonetic-context decision tree (so after + building the decision tree, we can build the PhoneContext object and then + discard the tree). + + There two types of cd-phones: cd-phones, and physical cd-phones. The logical + ones can be mapped to physical. The logical cd-phones are the ones that we + actually put in the graph, which will enable us to work out the phone sequence + (assuming the topology is 'alignable', which it normally will be). Logical + cd-phones are mappable to the (mono) phone; the physical ones are less + detailed, and can't necessarily be mapped to the monophones. + + Note that the PhoneTopology and PhoneContext will be incorporated as data + members in the ContextDependentTopology model, which contains information + about topology and context, and also controls the allocation of output-ids + (which are indexes into the neural net output, and roughly correspond to + context-dependent states in a conventional HMM-based system). +*/ + +class PhoneContext: public fst::DeterministicOnDemandFst { + public: + /* First, members that relate to the base class. */ + + // repeat the typedefs (they're not inherited automatically; we could inherit + // but they are boilerplate so we just repeat them). + typedef typename fst::StdArc Arc; + typedef typename Arc::StateId StateId; // should be int32. + typedef typename Arc::Weight Weight; + typedef typename Arc::Label Label; // should be int32. + + // The following are part of the interface from DeterministicOnDemandFst. + virtual StateId Start() { return 0; } + + // all states are final. + virtual Weight Final(StateId s) { return Weight::One(); } + + // Assuming 0 <= s < NumStates() and 1 <= phone <= NumPhones(), + // this function will return true and output to Arc as follows: + // ilabel = phone, olabel = logical-cd-phone, weight = One(), + // nextstate = [the next state after seeing this phone.] + virtual bool GetArc(StateId s, Label phone, Arc *oarc) = 0; + + // There is a concept of states in this model, whereby when it outputs a phone + // it advances the state. So it's an FST-like representation of the decision + // tree. States are numbered from 0 to NumStates() - 1. This function is + // actually not in the interface, but it is the same as in ExpandedFst. + int32 NumStates() const { return transitions_.size(); } + + virtual ~PhoneContext(); + + /* Next members not relating to the base class. */ + + PhoneContext(); + + // Initialization from a tree (which must be left-context only, i.e. + // CentralPosition() == ContextWidth() - 1). The initialization method relies + // on enumerating all possible contexts, so it will be slow if you have a + // ridiculously large context. + + // Note: we hope not to use this, we will use a separate version of the + // tree-building code that tries to reduce the number of 'context states'. + PhoneContext(int32 num_phones, const ContextDependencyInterface &ctx_dep); + + // Phones are numbered from 1 to NumPhones(). 
+ int32 NumPhones() const { return num_phones_; } + + + // Return the number of distinct labels on the topology FST for this phone: + // the labels must be contiguously numbered from 1, so this is the same as + // the largest topology label. + bool GetNumLabels(int32 phone) const; + + // Logical context-dependent phones are numbered from 1 to + // NumLogicalCdPhones(). + int32 NumLogicalCdPhones() const { return logical_to_phone_.size() - 1; } + + // Physical context-dependent phones are numbered from 1 to + // NumPhysicalCdPhones(). + int32 NumPhysicalCdPhones() const { return num_physical_cd_phones_; } + + // This function tells you how many phones of left-context the underlying + // decision tree was built with: 0 for monophone, 1 for left-biphone, etc. It + // amounts to an assertion that if you take a given phone sequence of length + // LeftContext(), and starting from any FST state, use that phone-sequence as + // ilabels, you'll always end up in the same state. + int32 LeftContext() const { return left_context_; } + + // Maps a logical CD-phone to the phone index (i.e. of the monophone with + // no context)-- you cannot map to a full context, that is not what + // logical CD-phones mean in this code. + int32 LogicalToPhone(int32 logical_cd_phone) const; + + // Maps a logical CD-phone to a physical CD-phone. + int32 LogicalToPhysical(int32 logical_cd_phone) const; + + // Given a context-dependent phone index, return the set of phones it may + // correspond to (in most cases this would be a set of just one element). + // We'll implement this when we need it- it will require storing derived + // variables, to make it efficient. + // + // void CdPhoneToPhones(int32 cd_phone, std::vector *phones); + + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &is); + + // Outputs to 'output' an FST that's a copy of this object in the normal FST + // format (as opposed to DeterministicOnDemandFst). This is the 'C' FST + // (the context-dependency FST) in the HCLG recipe. + // ilabels are phones, olabels are cd-phones. Note: can be implemented by + // taking an FST 'f' with one state that's initial and final, with self-loops + // for each phone, and then calling ComposeDeterministicOnDemand(f, *this, + // output). + void GetAsFst(fst::VectorFst* output) const; + private: + void Check(); + // Sets up the cd_phone_to_phone_ array. + void ComputeCdPhoneToPhone(); + + int32 num_phones_; + int32 num_physical_cd_phones_; + int32 left_context_; + + // 'transitions_' is indexed by state, then by phone - 1 (each vector of pairs + // is of length num_phones), and each pair is (cd-phone-index, next-state). + // For instance (bear in mind that 0 is the initial-state that you get at the + // begining of a phone_sequence), transitions_[0][p].first is the + // logical-cd-phone you get from seeing phone p with the left-context being the + // beginning of a sequence (i.e. a left-context of all zeros, as far as the + // tree is concerned); and transitions_[0][p].second is the context state you + // go to after seeing that phone. + std::vector > > transitions_; + + // map logical CD-phones to phones. Indexed by logical CD-phone (zeroth + // element not used). + std::vector logical_to_phone_; + + // map logical CD-phones to physical CD-phones. Indexed by logical CD-phone (zeroth + // element not used). 
+ std::vector logical_to_physical_; + +}; + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_PHONE_CONTEXT_H_ diff --git a/src/chain/phone-topology.cc b/src/chain/phone-topology.cc new file mode 100644 index 00000000000..e0a3fb639b7 --- /dev/null +++ b/src/chain/phone-topology.cc @@ -0,0 +1,98 @@ +// chain/phone-topology.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Xingyu Na + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/phone-topology.h" + +namespace kaldi { +namespace chain { + + +const fst::VectorFst& PhoneTopolgy::TopologyForPhone (int32 phone) { + return fsts_[phone]; +} + +PhoneTopology::PhoneTopology (int32 num_phones) { + fsts_.clear(); + fsts_.resize(num_phones + 1); + for (int32 i = 1; i <= num_phones; i++) { + fst::VectorFst fst; + fst.AddState(); // state 0 + fst.SetStart(0); // set start state + fst.AddState(); // state 1 + fst.AddArc(0, StdArc(1, 1, 0.5, 1)); + fst.AddArc(1, StdArc(2, 2, 0.5, 1)); + fst.SetFinal(1); // set final state + fsts_[i] = fst; + } +} + +void PhoneTopology::Write(std::ostream &os, bool binary) const{ + WriteToken(os, binary, ""); + if (!binary) os << "\n"; + int num_phones = fsts_.size() - 1; + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_phones); + if (!binary) os << "\n"; + std::vector >::iterator fiter = fsts_.begin(), + fend = fsts_.end(); + for (++fiter; fiter != fend; ++fiter) + WriteFstKaldi(os, binary, *fiter); + WriteToken(os, binary, ""); +} + +void PhoneTopology::Read(std::istream &is, bool binary) const{ + ExpectToken(is, binary, ""); + int num_phones; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_phones); + fsts_.resize(num_phones + 1); + std::vector >::iterator fiter = fsts_.begin(), + fend = fsts_.end(); + for (++fiter; fiter != fend; ++fiter) + ReadFstKaldi(os, binary, fiter); + ExpectToken(is, binary, ""); +} + +bool PhonoTopology::IsAlignable() { + std::vector >::iterator fiter = fsts_.begin(), + fend = fsts_.end(); + for (++fiter; fiter != fend; ++fiter) { + // Get start state symbles + unordered_set syms; + for (ArcIterator >aiter(*fiter, fiter->Start()); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + syms.insert(arc.ilabel); + } + for (StateIterator siter(*fiter); !siter.Done(); siter.Next()) { + typename Arc::StateId s = siter.Value(); + for (ArcIterator >aiter(*fiter, s); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + if (arc.nextstate == fiter->Start()) + return false; + if (s != fiter->Start() && syms.find(arc.ilabel) != syms.end()) + return false; + } + } + } + return true; +} + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/phone-topology.h b/src/chain/phone-topology.h new file mode 100644 index 00000000000..cec7e28686d --- /dev/null +++ b/src/chain/phone-topology.h @@ -0,0 +1,99 @@ +// chain/phone-topology.h + +// 
Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_PHONE_TOPOLOGY_H_ +#define KALDI_CHAIN_PHONE_TOPOLOGY_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" + +namespace kaldi { +namespace chain { + + +/** + The 'PhoneTopology' object stores the topology for each of the phones that the + system handles. This is the equivalent of a HMM topology, except that the + emission probabilities are on the arcs not the states (so it's much more + FST-like), and there are no transition probabilities (these are just folded + into the emission probabilities). Note that it's the fact that the 'chain' + system is trained discriminatively from the start is what enables us to treat + the transition probabilities this way. + + A topology is an epsilon-free finite state acceptor. The + 'normal' topology that you get if you don't do anything special, is as + follows: + +0 1 1 # transition from state 0 to state 1 with label 1. +1 1 2 # transition from state 1 to state 1 (self-loop) with label 2. +1 0 # this says that state 1 is final. + + The FSTs have the following properties: + - they are epsilon free + - the start state is numbered zero. + - the start state is not final. + - all states are used. + - the symbols on the labels of the FST start from 1 and are contiguous (no + unused symbols between the smallest and largest symbol). + + + Phones are given indexes from 1 to NumPhones() (no gaps are allowed here). + + A topology for a phone is an FST + */ + +class PhoneTopology { + public: + int32 NumPhones() { returns static_cast(fsts_.size()) - 1; } + + // Returns the topology for a given phone. + const fst::VectorFst &TopologyForPhone(int32 phone); + + // This constructor gives the phones the default topology. If you want to + // give it a different topology, then you can create the text-form of this + // object using a script. + PhoneTopology(int32 num_phones); + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &is, bool binary) const; + + // returns true if all the phones' FSTs have the following properties: + // - the symbols on arcs out of the start-state are disjoint from the + // symbols on arcs out of other states. + // - there are no arcs ending in the start state. + bool IsAlignable(); + private: + void Check(); + + // index zero is not used. 
+ std::vector > fsts_; +}; + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_PHONE_TOPOLOGY_H_ diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile new file mode 100644 index 00000000000..3fdd7fdb4d0 --- /dev/null +++ b/src/chainbin/Makefile @@ -0,0 +1,30 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ + nnet3-chain-get-egs nnet3-chain-copy-egs nnet3-chain-merge-egs \ + nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ + nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ + nnet3-chain-combine nnet3-chain-normalize-egs + + +OBJFILES = + +# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure. +cuda-compiled.o: ../kaldi.mk + +TESTFILES = + +ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a ../gmm/kaldi-gmm.a \ + ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ + ../cudamatrix/kaldi-cudamatrix.a \ + ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \ + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/chainbin/chain-est-phone-lm.cc b/src/chainbin/chain-est-phone-lm.cc new file mode 100644 index 00000000000..f16b3f4f14b --- /dev/null +++ b/src/chainbin/chain-est-phone-lm.cc @@ -0,0 +1,81 @@ +// chainbin/chain-est-phone-lm.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "chain/language-model.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + + const char *usage = + "Initialize un-smoothed phone language model for 'chain' training\n" + "Output in FST format (epsilon-free deterministic acceptor)\n" + "\n" + "Usage: chain-est-phone-lm [options] \n" + "The phone-sequences are used to train a language model.\n" + "e.g.:\n" + "gunzip -c input_dir/ali.*.gz | ali-to-phones input_dir/final.mdl ark:- ark:- | \\\n" + " chain-est-phone-lm --leftmost-context-questions=dir/leftmost_questions.txt ark:- dir/phone_G.fst\n"; + + bool binary_write = true; + LanguageModelOptions lm_opts; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + lm_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string phone_seqs_rspecifier = po.GetArg(1), + lm_fst_wxfilename = po.GetArg(2); + + + LanguageModelEstimator lm_estimator(lm_opts); + + SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); + KALDI_LOG << "Reading phone sequences"; + for (; !phones_reader.Done(); phones_reader.Next()) { + const std::vector &phone_seq = phones_reader.Value(); + lm_estimator.AddCounts(phone_seq); + } + KALDI_LOG << "Estimating phone LM"; + fst::StdVectorFst fst; + lm_estimator.Estimate(&fst); + + WriteFstKaldi(fst, lm_fst_wxfilename); + + KALDI_LOG << "Estimated phone language model and wrote it to " + << lm_fst_wxfilename; + return 0; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + diff --git a/src/chainbin/chain-get-supervision.cc b/src/chainbin/chain-get-supervision.cc new file mode 100644 index 00000000000..b05f1166da4 --- /dev/null +++ b/src/chainbin/chain-get-supervision.cc @@ -0,0 +1,151 @@ +// chainbin/chain-get-supervision.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace chain { + + +// This wrapper function does all the job of processing the features and +// lattice into ChainSupervision objects, and writing them out. 
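+// Concretely, it compiles the ProtoSupervision (phone-level supervision
+// derived from the alignment or the phone lattice) into a Supervision FST
+// using the tree and transition model, occasionally runs a consistency check,
+// and writes the result keyed by utterance id; it returns false if the
+// compilation fails.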
+static bool ProcessSupervision(const TransitionModel &trans_model, + const ContextDependencyInterface &ctx_dep, + const ProtoSupervision &proto_sup, + const std::string &key, + SupervisionWriter *supervision_writer) { + Supervision supervision; + if (!ProtoSupervisionToSupervision(ctx_dep, trans_model, + proto_sup, &supervision)) { + KALDI_WARN << "Failed creating supervision for utterance " + << key; + return false; + } + if (RandInt(0, 10) == 0) + supervision.Check(trans_model); + + supervision_writer->Write(key, supervision); + return true; +} + + +} // namespace chain +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get a 'chain' supervision object for each file of training data.\n" + "This will normally be piped into nnet3-chain-get-egs, where it\n" + "will be split up into pieces and combined with the features.\n" + "Input can come in two formats: from alignments\n" + "(from ali-to-phones --write-lenghts=true), or from lattices\n" + "(e.g. derived from aligning the data, see steps/align_fmllr_lats.sh)\n" + "that have been converged to phone-level lattices with\n" + "lattice-align-phones --replace-output-symbols=true.\n" + "\n" + "Usage: chain-get-supervision [options] " + "[|] \n" + "See steps/nnet3/chain/get_egs.sh for example\n"; + + + bool lattice_input = false; + SupervisionOptions sup_opts; + + ParseOptions po(usage); + sup_opts.Register(&po); + po.Register("lattice-input", &lattice_input, "If true, expect phone " + "lattices as input"); + + po.Read(argc, argv); + + if (po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + + std::string tree_rxfilename = po.GetArg(1), + trans_model_rxfilename = po.GetArg(2), + phone_durs_or_lat_rspecifier = po.GetArg(3), + supervision_wspecifier = po.GetArg(4); + + TransitionModel trans_model; + ReadKaldiObject(trans_model_rxfilename, &trans_model); + + ContextDependency ctx_dep; + ReadKaldiObject(tree_rxfilename, &ctx_dep); + + SupervisionWriter supervision_writer(supervision_wspecifier); + + int32 num_utts_done = 0, num_utts_error = 0; + + if (lattice_input) { + SequentialCompactLatticeReader clat_reader(phone_durs_or_lat_rspecifier); + for (; !clat_reader.Done(); clat_reader.Next()) { + std::string key = clat_reader.Key(); + const CompactLattice &clat = clat_reader.Value(); + ProtoSupervision proto_supervision; + bool ans = PhoneLatticeToProtoSupervision(sup_opts, clat, + &proto_supervision); + if (!ans) { + KALDI_WARN << "Error creating proto-supervision for utterance " << key; + num_utts_error++; + continue; + } + if (ProcessSupervision(trans_model, ctx_dep, + proto_supervision, key, &supervision_writer)) + num_utts_done++; + else + num_utts_error++; + } + } else { + SequentialInt32PairVectorReader phone_and_dur_reader( + phone_durs_or_lat_rspecifier); + for (; !phone_and_dur_reader.Done(); phone_and_dur_reader.Next()) { + std::string key = phone_and_dur_reader.Key(); + const std::vector > &ali = + phone_and_dur_reader.Value(); + ProtoSupervision proto_supervision; + AlignmentToProtoSupervision(sup_opts, ali, + &proto_supervision); + if (ProcessSupervision(trans_model, ctx_dep, + proto_supervision, key, &supervision_writer)) + num_utts_done++; + else + num_utts_error++; + } + } + KALDI_LOG << "Generated chain supervision information for " + << num_utts_done << " utterances, errors on " + << num_utts_error; + return (num_utts_done > num_utts_error ? 
0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
diff --git a/src/chainbin/chain-make-den-fst.cc b/src/chainbin/chain-make-den-fst.cc
new file mode 100644
index 00000000000..0d8d249242b
--- /dev/null
+++ b/src/chainbin/chain-make-den-fst.cc
@@ -0,0 +1,86 @@
+// chainbin/chain-make-den-fst.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "chain/chain-den-graph.h"
+
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::chain;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Create 'denominator' FST for 'chain' training\n"
+        "Outputs in FST format. <denominator-fst> is an epsilon-free acceptor;\n"
+        "<normalization-fst> is a modified version of <denominator-fst> (w.r.t.\n"
+        "initial and final probs) that is used in example generation.\n"
+        "\n"
+        "Usage: chain-make-den-fst [options] <tree> <transition-model> "
+        "<phone-lm-fst> <denominator-fst> <normalization-fst>\n"
+        "e.g.:\n"
+        "chain-make-den-fst dir/tree dir/0.trans_mdl dir/phone_lm.fst dir/den.fst dir/normalization.fst\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 5) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string tree_rxfilename = po.GetArg(1),
+        transition_model_rxfilename = po.GetArg(2),
+        phone_lm_rxfilename = po.GetArg(3),
+        den_fst_wxfilename = po.GetArg(4),
+        normalization_fst_wxfilename = po.GetArg(5);
+
+
+    ContextDependency ctx_dep;
+    TransitionModel trans_model;
+    fst::StdVectorFst phone_lm;
+
+    ReadKaldiObject(tree_rxfilename, &ctx_dep);
+    ReadKaldiObject(transition_model_rxfilename, &trans_model);
+    ReadFstKaldi(phone_lm_rxfilename, &phone_lm);
+
+    fst::StdVectorFst den_fst;
+    chain::CreateDenominatorFst(ctx_dep, trans_model, phone_lm,
+                                &den_fst);
+
+    fst::StdVectorFst normalization_fst;
+    chain::DenominatorGraph den_graph(den_fst, trans_model.NumPdfs());
+    den_graph.GetNormalizationFst(den_fst, &normalization_fst);
+
+
+    WriteFstKaldi(den_fst, den_fst_wxfilename);
+    WriteFstKaldi(normalization_fst, normalization_fst_wxfilename);
+
+    KALDI_LOG << "Wrote denominator FST to " << den_fst_wxfilename
+              << " and normalization FST to " << normalization_fst_wxfilename;
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc
new file mode 100644
index 00000000000..3f092879b6e
--- /dev/null
+++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc
@@ -0,0 +1,206 @@
+// chainbin/nnet3-chain-acc-lda-stats.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "lat/lattice-functions.h" +#include "nnet3/nnet-nnet.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "transform/lda-estimate.h" + + +namespace kaldi { +namespace nnet3 { + +class NnetChainLdaStatsAccumulator { + public: + NnetChainLdaStatsAccumulator(BaseFloat rand_prune, + const Nnet &nnet): + rand_prune_(rand_prune), nnet_(nnet), compiler_(nnet) { } + + + void AccStats(const NnetChainExample &eg) { + ComputationRequest request; + bool need_backprop = false, store_stats = false, + need_xent = false, need_xent_deriv = false; + + GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, + need_xent, need_xent_deriv, &request); + + const NnetComputation &computation = *(compiler_.Compile(request)); + + NnetComputeOptions options; + if (GetVerboseLevel() >= 3) + options.debug = true; + NnetComputer computer(options, computation, nnet_, NULL); + + computer.AcceptInputs(nnet_, eg.inputs); + computer.Forward(); + const CuMatrixBase &nnet_output = computer.GetOutput("output"); + AccStatsFromOutput(eg, nnet_output); + } + + void WriteStats(const std::string &stats_wxfilename, bool binary) { + if (lda_stats_.TotCount() == 0) { + KALDI_ERR << "Accumulated no stats."; + } else { + WriteKaldiObject(lda_stats_, stats_wxfilename, binary); + KALDI_LOG << "Accumulated stats, soft frame count = " + << lda_stats_.TotCount() << ". Wrote to " + << stats_wxfilename; + } + } + private: + void AccStatsFromOutput(const NnetChainExample &eg, + const CuMatrixBase &nnet_output) { + BaseFloat rand_prune = rand_prune_; + + if (eg.outputs.size() != 1 || eg.outputs[0].name != "output") + KALDI_ERR << "Expecting the example to have one output named 'output'."; + + + const chain::Supervision &supervision = eg.outputs[0].supervision; + // handling the one-sequence-per-eg case is easier so we just do that. + KALDI_ASSERT(supervision.num_sequences == 1 && + "This program expects one sequence per eg."); + int32 num_frames = supervision.frames_per_sequence, + num_pdfs = supervision.label_dim; + KALDI_ASSERT(num_frames == nnet_output.NumRows()); + const fst::StdVectorFst &fst = supervision.fst; + + Lattice lat; + // convert the FST to a lattice, putting all the weight on + // the graph weight. This is to save us having to implement the + // forward-backward on FSTs. + ConvertFstToLattice(fst, &lat); + Posterior post; + LatticeForwardBackward(lat, &post); + KALDI_ASSERT(post.size() == static_cast(num_frames)); + + // Subtract one, to convert the (pdf-id + 1) which appears in the + // supervision FST, to a pdf-id. 
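+    // (The +1 offset is presumably there because label 0 is reserved for
+    // epsilon in OpenFst, so pdf-ids cannot appear directly as FST labels.)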
+ for (size_t i = 0; i < post.size(); i++) + for (size_t j = 0; j < post[i].size(); j++) + post[i][j].first--; + + if (lda_stats_.Dim() == 0) + lda_stats_.Init(num_pdfs, + nnet_output.NumCols()); + + for (int32 t = 0; t < num_frames; t++) { + // the following, transferring row by row to CPU, would be wasteful if we + // actually were using a GPU, but we don't anticipate using a GPU in this + // program. + CuSubVector cu_row(nnet_output, t); + // "row" is actually just a redudant copy, since we're likely on CPU, + // but we're about to do an outer product, so this doesn't dominate. + Vector row(cu_row); + + std::vector >::const_iterator + iter = post[t].begin(), end = post[t].end(); + + for (; iter != end; ++iter) { + int32 pdf = iter->first; + BaseFloat weight = iter->second; + BaseFloat pruned_weight = RandPrune(weight, rand_prune); + if (pruned_weight != 0.0) + lda_stats_.Accumulate(row, pdf, pruned_weight); + } + } + } + + BaseFloat rand_prune_; + const Nnet &nnet_; + CachingOptimizingCompiler compiler_; + LdaEstimate lda_stats_; +}; + +} +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Accumulate statistics in the same format as acc-lda (i.e. stats for\n" + "estimation of LDA and similar types of transform), starting from nnet+chain\n" + "training examples. This program puts the features through the network,\n" + "and the network output will be the features; the supervision in the\n" + "training examples is used for the class labels. Used in obtaining\n" + "feature transforms that help nnet training work better.\n" + "Note: the time boundaries it gets from the chain supervision will be\n" + "a little fuzzy (which is not ideal), but it should not matter much in\n" + "this situation\n" + "\n" + "Usage: nnet3-chain-acc-lda-stats [options] \n" + "e.g.:\n" + "nnet3-chain-acc-lda-stats 0.raw ark:1.cegs 1.acc\n" + "See also: nnet-get-feature-transform\n"; + + bool binary_write = true; + BaseFloat rand_prune = 0.0; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("rand-prune", &rand_prune, + "Randomized pruning threshold for posteriors"); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + lda_accs_wxfilename = po.GetArg(3); + + // Note: this neural net is probably just splicing the features at this + // point. + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + NnetChainLdaStatsAccumulator accumulator(rand_prune, nnet); + + int64 num_egs = 0; + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + for (; !example_reader.Done(); example_reader.Next(), num_egs++) + accumulator.AccStats(example_reader.Value()); + + KALDI_LOG << "Processed " << num_egs << " examples."; + // the next command will die if we accumulated no stats. 
+ accumulator.WriteStats(lda_accs_wxfilename, binary_write); + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc new file mode 100644 index 00000000000..a7083c8332e --- /dev/null +++ b/src/chainbin/nnet3-chain-combine.cc @@ -0,0 +1,121 @@ +// chainbin/nnet3-chain-combine.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-chain-combine.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Using a subset of training or held-out nnet3+chain examples, compute an\n" + "optimal combination of anumber of nnet3 neural nets by maximizing the\n" + "'chain' objective function. See documentation of options for more details.\n" + "Inputs and outputs are nnet3 raw nnets.\n" + "\n" + "Usage: nnet3-chain-combine [options] ... \n" + "\n" + "e.g.:\n" + " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; + + bool binary_write = true; + std::string use_gpu = "yes"; + NnetCombineConfig combine_config; + chain::ChainTrainingOptions chain_config; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + + combine_config.Register(&po); + chain_config.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() < 4) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + std::string + den_fst_rxfilename = po.GetArg(1), + raw_nnet_rxfilename = po.GetArg(2), + valid_examples_rspecifier = po.GetArg(po.NumArgs() - 1), + nnet_wxfilename = po.GetArg(po.NumArgs()); + + + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); + + Nnet nnet; + ReadKaldiObject(raw_nnet_rxfilename, &nnet); + + + std::vector egs; + egs.reserve(10000); // reserve a lot of space to minimize the chance of + // reallocation. + + { // This block adds training examples to "egs". 
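+      // Note: despite the variable name, these may be held-out or training
+      // examples; they are only used to choose the combination weights.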
+ SequentialNnetChainExampleReader example_reader( + valid_examples_rspecifier); + for (; !example_reader.Done(); example_reader.Next()) + egs.push_back(example_reader.Value()); + KALDI_LOG << "Read " << egs.size() << " examples."; + KALDI_ASSERT(!egs.empty()); + } + + + int32 num_nnets = po.NumArgs() - 3; + NnetChainCombiner combiner(combine_config, chain_config, + num_nnets, egs, den_fst, nnet); + + for (int32 n = 1; n < num_nnets; n++) { + std::string this_nnet_rxfilename = po.GetArg(n + 2); + ReadKaldiObject(this_nnet_rxfilename, &nnet); + combiner.AcceptNnet(nnet); + } + + combiner.Combine(); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + + WriteKaldiObject(combiner.GetNnet(), nnet_wxfilename, binary_write); + + KALDI_LOG << "Finished combining neural nets, wrote model to " + << nnet_wxfilename; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-compute-prob.cc b/src/chainbin/nnet3-chain-compute-prob.cc new file mode 100644 index 00000000000..7f9d688777a --- /dev/null +++ b/src/chainbin/nnet3-chain-compute-prob.cc @@ -0,0 +1,88 @@ +// nnet3bin/nnet3-chain-compute-prob.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-chain-diagnostics.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Computes and prints to in logging messages the average log-prob per frame of\n" + "the given data with an nnet3+chain neural net. The input of this is the output of\n" + "e.g. nnet3-chain-get-egs | nnet3-chain-merge-egs.\n" + "\n" + "Usage: nnet3-chain-compute-prob [options] \n" + "e.g.: nnet3-chain-compute-prob 0.mdl den.fst ark:valid.egs\n"; + + + // This program doesn't support using a GPU, because these probabilities are + // used for diagnostics, and you can just compute them with a small enough + // amount of data that a CPU can do it within reasonable time. + // It wouldn't be hard to make it support GPU, though. 
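+    // Illustrative invocation in a training script (file names hypothetical):
+    //   nnet3-chain-compute-prob 10.raw den.fst \
+    //     "ark:nnet3-chain-merge-egs ark:valid_diagnostic.cegs ark:- |"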
+ + NnetComputeProbOptions nnet_opts; + chain::ChainTrainingOptions chain_opts; + + ParseOptions po(usage); + + nnet_opts.Register(&po); + chain_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string nnet_rxfilename = po.GetArg(1), + den_fst_rxfilename = po.GetArg(2), + examples_rspecifier = po.GetArg(3); + + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); + + NnetChainComputeProb chain_prob_computer(nnet_opts, chain_opts, den_fst, + nnet); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + chain_prob_computer.Compute(example_reader.Value()); + + bool ok = chain_prob_computer.PrintTotalStats(); + + return (ok ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc new file mode 100644 index 00000000000..5404cdc438e --- /dev/null +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -0,0 +1,140 @@ +// chainbin/nnet3-chain-copy-egs.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-chain-example.h" + +namespace kaldi { +// returns an integer randomly drawn with expected value "expected_count" +// (will be either floor(expected_count) or ceil(expected_count)). +int32 GetCount(double expected_count) { + KALDI_ASSERT(expected_count >= 0.0); + int32 ans = floor(expected_count); + expected_count -= ans; + if (WithProb(expected_count)) + ans++; + return ans; +} +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Copy examples for nnet3+chain network training, possibly changing the binary mode.\n" + "Supports multiple wspecifiers, in which case it will write the examples\n" + "round-robin to the outputs.\n" + "\n" + "Usage: nnet3-chain-copy-egs [options] [ ...]\n" + "\n" + "e.g.\n" + "nnet3-chain-copy-egs ark:train.cegs ark,t:text.cegs\n" + "or:\n" + "nnet3-chain-copy-egs ark:train.cegs ark:1.cegs ark:2.cegs\n"; + + bool random = false; + int32 srand_seed = 0; + int32 frame_shift = 0; + int32 truncate_deriv_weights = 0; + BaseFloat keep_proportion = 1.0; + + ParseOptions po(usage); + po.Register("random", &random, "If true, will write frames to output " + "archives randomly, not round-robin."); + po.Register("keep-proportion", &keep_proportion, "If <1.0, this program will " + "randomly keep this proportion of the input samples. 
If >1.0, it will " + "in expectation copy a sample this many times. It will copy it a number " + "of times equal to floor(keep-proportion) or ceil(keep-proportion)."); + po.Register("srand", &srand_seed, "Seed for random number generator " + "(only relevant if --random=true or --keep-proportion != 1.0)"); + po.Register("frame-shift", &frame_shift, "Allows you to shift time values " + "in the supervision data (excluding iVector data) - useful in " + "augmenting data. Note, the outputs will remain at the closest " + "exact multiples of the frame subsampling factor"); + po.Register("truncate-deriv-weights", &truncate_deriv_weights, + "If nonzero, the number of initial/final subsample frames that " + "will have their derivatives' weights set to zero."); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 2) { + po.PrintUsage(); + exit(1); + } + + std::string examples_rspecifier = po.GetArg(1); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + + int32 num_outputs = po.NumArgs() - 1; + std::vector example_writers(num_outputs); + for (int32 i = 0; i < num_outputs; i++) + example_writers[i] = new NnetChainExampleWriter(po.GetArg(i+2)); + + std::vector exclude_names; // names we never shift times of; + // not configurable for now. + exclude_names.push_back(std::string("ivector")); + + + int64 num_read = 0, num_written = 0; + for (; !example_reader.Done(); example_reader.Next(), num_read++) { + // count is normally 1; could be 0, or possibly >1. + int32 count = GetCount(keep_proportion); + std::string key = example_reader.Key(); + if (frame_shift == 0 && truncate_deriv_weights == 0) { + const NnetChainExample &eg = example_reader.Value(); + for (int32 c = 0; c < count; c++) { + int32 index = (random ? Rand() : num_written) % num_outputs; + example_writers[index]->Write(key, eg); + num_written++; + } + } else if (count > 0) { + NnetChainExample eg = example_reader.Value(); + if (frame_shift != 0) + ShiftChainExampleTimes(frame_shift, exclude_names, &eg); + if (truncate_deriv_weights != 0) + TruncateDerivWeights(truncate_deriv_weights, &eg); + for (int32 c = 0; c < count; c++) { + int32 index = (random ? Rand() : num_written) % num_outputs; + example_writers[index]->Write(key, eg); + num_written++; + } + } + } + for (int32 i = 0; i < num_outputs; i++) + delete example_writers[i]; + KALDI_LOG << "Read " << num_read + << " neural-network training examples, wrote " << num_written; + return (num_written == 0 ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc new file mode 100644 index 00000000000..321b18ed122 --- /dev/null +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -0,0 +1,372 @@ +// chainbin/nnet3-chain-get-egs.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "hmm/posterior.h"
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-chain-example.h"
+#include "nnet3/nnet-example-utils.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+
+/**
+   This function does all the processing for one utterance, and outputs the
+   supervision objects to 'example_writer'.  Note: if normalization_fst is the
+   empty FST (with no states), it skips the final stage of egs preparation and
+   you should do it later with nnet3-chain-normalize-egs.
+*/
+
+static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
+                        const MatrixBase<BaseFloat> &feats,
+                        const MatrixBase<BaseFloat> *ivector_feats,
+                        const chain::Supervision &supervision,
+                        const std::string &utt_id,
+                        bool compress,
+                        int32 left_context,
+                        int32 right_context,
+                        int32 frames_per_eg,
+                        int32 frames_overlap_per_eg,
+                        int32 frame_subsampling_factor,
+                        int32 cut_zero_frames,
+                        int64 *num_frames_written,
+                        int64 *num_egs_written,
+                        NnetChainExampleWriter *example_writer) {
+  KALDI_ASSERT(supervision.num_sequences == 1);
+  int32 num_feature_frames = feats.NumRows(),
+      num_output_frames = supervision.frames_per_sequence,
+      num_feature_frames_subsampled =
+      (num_feature_frames + frame_subsampling_factor - 1) /
+      frame_subsampling_factor;
+  if (num_output_frames != num_feature_frames_subsampled) {
+    // we tolerate deviations in the num-frames if they are very small (1 output
+    // frame).
+
+    if (abs(num_output_frames - num_feature_frames_subsampled) > 1) {
+      KALDI_ERR << "Mismatch in num-frames: chain supervision has "
+                << num_output_frames
+                << " versus features/frame_subsampling_factor = "
+                << num_feature_frames << " / " << frame_subsampling_factor
+                << " = " << num_feature_frames_subsampled
+                << ": check that --frame-subsampling-factor option is set "
+                << "the same as to chain-get-supervision.";
+    }
+    int32 new_num_feature_frames =
+        num_output_frames * frame_subsampling_factor;
+    // add a few frames at the end to make it match up.
+    Matrix<BaseFloat> feats_new(new_num_feature_frames, feats.NumCols(),
+                                kUndefined);
+    int32 min_feature_frames = std::min(num_feature_frames,
+                                        new_num_feature_frames);
+    feats_new.RowRange(0, min_feature_frames).CopyFromMat(
+        feats.RowRange(0, min_feature_frames));
+    for (int32 i = num_feature_frames; i < new_num_feature_frames; i++)
+      feats_new.Row(i).CopyFromVec(feats.Row(num_feature_frames - 1));
+    return ProcessFile(normalization_fst, feats_new, ivector_feats,
+                       supervision, utt_id, compress, left_context, right_context,
+                       frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor,
+                       cut_zero_frames, num_frames_written, num_egs_written,
+                       example_writer);
+  }
+
+  KALDI_ASSERT(frames_per_eg % frame_subsampling_factor == 0);
+
+  int32 frames_per_eg_subsampled = frames_per_eg / frame_subsampling_factor,
+      frames_overlap_subsampled = frames_overlap_per_eg / frame_subsampling_factor,
+      frames_shift_subsampled = frames_per_eg_subsampled - frames_overlap_subsampled;
+
+  if (num_feature_frames_subsampled < frames_per_eg_subsampled) {
+    KALDI_WARN << "Length of features for utterance " << utt_id
+               << " is less than the frames_per_eg (after sub-sampling).";
+    return false;
+  }
+
+  // we don't do any padding, as it would be a bit tricky to pad the 'chain' supervision.
+  // Instead we select ranges of frames that fully fit within the file; these
+  // might slightly overlap with each other or have gaps.
+  std::vector<int32> range_starts_subsampled;
+  chain::SplitIntoRanges(num_feature_frames_subsampled -
+                         frames_overlap_subsampled,
+                         frames_shift_subsampled,
+                         &range_starts_subsampled);
+  // The 'deriv_weights' make sure we don't count frames twice, and also ensure
+  // that we tend to avoid having nonzero weights on the derivatives that are
+  // too close to the edge of the corresponding 'range' (these derivatives close
+  // to the edge are not as accurate as they could be, because when we split we
+  // don't know the correct alphas and betas).
+  std::vector<Vector<BaseFloat> > deriv_weights;
+  if (cut_zero_frames >= 0)
+    chain::GetWeightsForRangesNew(frames_per_eg_subsampled,
+                                  cut_zero_frames / frame_subsampling_factor,
+                                  range_starts_subsampled,
+                                  &deriv_weights);
+  else
+    chain::GetWeightsForRanges(frames_per_eg_subsampled,
+                               range_starts_subsampled,
+                               &deriv_weights);
+
+  if (range_starts_subsampled.empty()) {
+    KALDI_WARN << "No output for utterance " << utt_id
+               << " (num-frames=" << num_feature_frames
+               << ") because too short for --frames-per-eg="
+               << frames_per_eg;
+    return false;
+  }
+  chain::SupervisionSplitter splitter(supervision);
+
+  for (size_t i = 0; i < range_starts_subsampled.size(); i++) {
+    int32 range_start_subsampled = range_starts_subsampled[i],
+        range_start = range_start_subsampled * frame_subsampling_factor;
+
+    chain::Supervision supervision_part;
+    splitter.GetFrameRange(range_start_subsampled,
+                           frames_per_eg_subsampled,
+                           &supervision_part);
+
+    if (normalization_fst.NumStates() > 0 &&
+        !AddWeightToSupervisionFst(normalization_fst,
+                                   &supervision_part)) {
+      KALDI_WARN << "For utterance " << utt_id << ", frames "
+                 << range_start << " to " << (range_start + frames_per_eg)
+                 << ", FST was empty after composing with normalization FST. "
+                 << "This should be extremely rare (a few per corpus, at most)";
+      return false;
+    }
+
+    int32 first_frame = 0;  // we shift the time-indexes of all these parts so
+                            // that the supervised part starts from frame 0.
+    NnetChainSupervision nnet_supervision("output", supervision_part,
+                                          deriv_weights[i],
+                                          first_frame, frame_subsampling_factor);
+
+    NnetChainExample nnet_chain_eg;
+    nnet_chain_eg.outputs.resize(1);
+    nnet_chain_eg.outputs[0].Swap(&nnet_supervision);
+    nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 2 : 1);
+
+    int32 tot_frames = left_context + frames_per_eg + right_context;
+    Matrix<BaseFloat> input_frames(tot_frames, feats.NumCols(), kUndefined);
+
+    // Set up "input_frames".
+    for (int32 j = -left_context; j < frames_per_eg + right_context; j++) {
+      int32 t = range_start + j;
+      if (t < 0) t = 0;
+      if (t >= feats.NumRows()) t = feats.NumRows() - 1;
+      SubVector<BaseFloat> src(feats, t),
+          dest(input_frames, j + left_context);
+      dest.CopyFromVec(src);
+    }
+    NnetIo input_io("input", -left_context,
+                    input_frames);
+    nnet_chain_eg.inputs[0].Swap(&input_io);
+
+    if (ivector_feats != NULL) {
+      // if applicable, add the iVector feature.
+      // try to get closest frame to middle of window to get
+      // a representative iVector.
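+      // For example (illustrative numbers only): with range_start = 100 and
+      // frames_per_eg = 150, we request frame 100 + 150 / 2 = 175, clamped
+      // below to the last row if the iVector matrix is shorter than that.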
+ int32 closest_frame = range_start + frames_per_eg / 2; + KALDI_ASSERT(ivector_feats->NumRows() > 0); + if (closest_frame >= ivector_feats->NumRows()) + closest_frame = ivector_feats->NumRows() - 1; + Matrix ivector(1, ivector_feats->NumCols()); + ivector.Row(0).CopyFromVec(ivector_feats->Row(closest_frame)); + NnetIo ivector_io("ivector", 0, ivector); + nnet_chain_eg.inputs[1].Swap(&ivector_io); + } + + if (compress) + nnet_chain_eg.Compress(); + + std::ostringstream os; + os << utt_id << "-" << range_start; + + std::string key = os.str(); // key is - + + *num_frames_written += frames_per_eg; + *num_egs_written += 1; + + example_writer->Write(key, nnet_chain_eg); + } + return true; +} + +} // namespace nnet2 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get frame-by-frame examples of data for nnet3+chain neural network\n" + "training. This involves breaking up utterances into pieces of a\n" + "fixed size. Input will come from chain-get-supervision.\n" + "Note: if is not supplied the egs will not be\n" + "ready for training; in that case they should later be processed\n" + "with nnet3-chain-normalize-egs\n" + "\n" + "Usage: nnet3-chain-get-egs [options] [] " + " \n" + "\n" + "An example [where $feats expands to the actual features]:\n" + "chain-get-supervision [args] | \\\n" + " nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n" + " \"$feats\" ark,s,cs:- ark:cegs.1.ark\n" + "Note: the --frame-subsampling-factor option must be the same as given to\n" + "chain-get-supervision.\n"; + + bool compress = true; + int32 left_context = 0, right_context = 0, num_frames = 1, + num_frames_overlap = 0, length_tolerance = 100, + cut_zero_frames = -1, + frame_subsampling_factor = 1; + + std::string ivector_rspecifier; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format (recommended)"); + po.Register("cut-zero-frames", &cut_zero_frames, "Number of frames " + "(measured before subsampling) to zero the derivative on each " + "side of a cut point (if set, activates new-style derivative " + "weights)"); + po.Register("left-context", &left_context, "Number of frames of left " + "context the neural net requires."); + po.Register("right-context", &right_context, "Number of frames of right " + "context the neural net requires."); + po.Register("num-frames", &num_frames, "Number of frames with labels " + "that each example contains. Will be rounded up to a multiple " + "of --frame-subsampling-factor."); + po.Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " + "overlap between each example (could be useful in conjunction " + "--min-deriv-time and --max-deriv-time, to avoid wasting data). 
" + "Each time we shift by --num-frames minus --num-frames-overlap."); + po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector " + "features, as a matrix."); + po.Register("length-tolerance", &length_tolerance, "Tolerance for " + "difference in num-frames between feat and ivector matrices"); + po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " + "if the frame-rate at the output will be less than the " + "frame-rate of the input"); + + po.Read(argc, argv); + + if (po.NumArgs() < 3 || po.NumArgs() > 4) { + po.PrintUsage(); + exit(1); + } + + if (num_frames <= 0 || left_context < 0 || right_context < 0 || + length_tolerance < 0 || frame_subsampling_factor <= 0) + KALDI_ERR << "One of the integer options is out of the allowed range."; + RoundUpNumFrames(frame_subsampling_factor, + &num_frames, &num_frames_overlap); + + std::string + normalization_fst_rxfilename, + feature_rspecifier, + supervision_rspecifier, + examples_wspecifier; + if (po.NumArgs() == 3) { + feature_rspecifier = po.GetArg(1); + supervision_rspecifier = po.GetArg(2); + examples_wspecifier = po.GetArg(3); + } else { + normalization_fst_rxfilename = po.GetArg(1); + KALDI_ASSERT(!normalization_fst_rxfilename.empty()); + feature_rspecifier = po.GetArg(2); + supervision_rspecifier = po.GetArg(3); + examples_wspecifier = po.GetArg(4); + } + + fst::StdVectorFst normalization_fst; + if (!normalization_fst_rxfilename.empty()) { + ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + KALDI_ASSERT(normalization_fst.NumStates() > 0); + } + + SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); + chain::RandomAccessSupervisionReader supervision_reader( + supervision_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + RandomAccessBaseFloatMatrixReader ivector_reader(ivector_rspecifier); + + int32 num_done = 0, num_err = 0; + int64 num_frames_written = 0, num_egs_written = 0; + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const Matrix &feats = feat_reader.Value(); + if (!supervision_reader.HasKey(key)) { + KALDI_WARN << "No pdf-level posterior for key " << key; + num_err++; + } else { + const chain::Supervision &supervision = supervision_reader.Value(key); + const Matrix *ivector_feats = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(key)) { + KALDI_WARN << "No iVectors for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. 
+ ivector_feats = &(ivector_reader.Value(key)); + } + } + if (ivector_feats != NULL && + (abs(feats.NumRows() - ivector_feats->NumRows()) > length_tolerance + || ivector_feats->NumRows() == 0)) { + KALDI_WARN << "Length difference between feats " << feats.NumRows() + << " and iVectors " << ivector_feats->NumRows() + << " exceeds tolerance " << length_tolerance; + num_err++; + continue; + } + if (ProcessFile(normalization_fst, feats, ivector_feats, supervision, + key, compress, + left_context, right_context, num_frames, + num_frames_overlap, frame_subsampling_factor, + cut_zero_frames, &num_frames_written, &num_egs_written, + &example_writer)) + num_done++; + else + num_err++; + } + } + + KALDI_LOG << "Finished generating nnet3-chain examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples, " + << " with " << num_frames_written << " frames in total; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc new file mode 100644 index 00000000000..45dca4051f3 --- /dev/null +++ b/src/chainbin/nnet3-chain-merge-egs.cc @@ -0,0 +1,101 @@ +// chainbin/nnet3-chain-merge-egs.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-chain-example.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "This copies nnet3+chain training examples from input to output, merging them\n" + "into composite examples. The --minibatch-size option controls how many egs\n" + "are merged into a single output eg.\n" + "\n" + "Usage: nnet3-chain-merge-egs [options] \n" + "e.g.\n" + "nnet3-chain-merge-egs --minibatch-size=128 ark:1.cegs ark:- | nnet3-chain-train-simple ... 
\n" + "See also nnet3-chain-copy-egs\n"; + + bool compress = false; + int32 minibatch_size = 64; + + ParseOptions po(usage); + po.Register("minibatch-size", &minibatch_size, "Target size of minibatches " + "when merging (see also --measure-output-frames)"); + po.Register("compress", &compress, "If true, compress the output examples " + "(not recommended unless you are writing to disk"); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string examples_rspecifier = po.GetArg(1), + examples_wspecifier = po.GetArg(2); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + + std::vector examples; + examples.reserve(minibatch_size); + + int64 num_read = 0, num_written = 0; + while (!example_reader.Done()) { + const NnetChainExample &cur_eg = example_reader.Value(); + examples.resize(examples.size() + 1); + examples.back() = cur_eg; + + bool minibatch_ready = + static_cast(examples.size()) >= minibatch_size; + + // Do Next() now, so we can test example_reader.Done() below . + example_reader.Next(); + num_read++; + + if (minibatch_ready || (example_reader.Done() && !examples.empty())) { + NnetChainExample merged_eg; + MergeChainExamples(compress, &examples, &merged_eg); + std::ostringstream ostr; + ostr << "merged-" << num_written; + num_written++; + std::string output_key = ostr.str(); + example_writer.Write(output_key, merged_eg); + examples.clear(); + } + } + KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; + return (num_written != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc new file mode 100644 index 00000000000..9d3f56f756a --- /dev/null +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -0,0 +1,91 @@ +// chainbin/nnet3-chain-normalize-egs.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
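+
+// Note: this stage is needed only when nnet3-chain-get-egs was run without a
+// normalization FST; otherwise the normalization weights have already been
+// composed into the supervision FSTs at egs-creation time.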
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-chain-example.h" +#include "chain/chain-supervision.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Add weights from 'normalization' FST to nnet3+chain examples.\n" + "Should be done if and only if the argument of\n" + "nnet3-chain-get-egs was not supplied when the original egs were\n" + "created.\n" + "\n" + "Usage: nnet3-chain-normalize-egs [options] \n" + "\n" + "e.g.\n" + "nnet3-chain-normalize-egs dir/normalization.fst ark:train_in.cegs ark:train_out.cegs\n"; + + ParseOptions po(usage); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string normalization_fst_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + examples_wspecifier = po.GetArg(3); + + fst::StdVectorFst normalization_fst; + ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + + int64 num_written = 0, num_err = 0;; + for (; !example_reader.Done(); example_reader.Next()) { + std::string key = example_reader.Key(); + NnetChainExample eg = example_reader.Value(); + + if (eg.outputs.size() != 1) + KALDI_ERR << "Expected example to have exactly one output."; + if (!AddWeightToSupervisionFst(normalization_fst, + &(eg.outputs[0].supervision))) { + KALDI_WARN << "For example " << key + << ", FST was empty after composing with normalization FST. " + << "This should be extremely rare (a few per corpus, at most)"; + num_err++; + } else { + example_writer.Write(key, eg); + num_written++; + } + } + + KALDI_LOG << "Added normalization to " << num_written + << " egs; had errors on " << num_err; + return (num_written == 0 ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-shuffle-egs.cc b/src/chainbin/nnet3-chain-shuffle-egs.cc new file mode 100644 index 00000000000..7ab6e28f607 --- /dev/null +++ b/src/chainbin/nnet3-chain-shuffle-egs.cc @@ -0,0 +1,115 @@ +// chainbin/nnet3-chain-shuffle-egs.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
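+
+// Memory note: with --buffer-size=0 every example is read into memory and the
+// order is fully shuffled; with --buffer-size=N only N examples are held at a
+// time and each incoming example randomly evicts (and writes) one buffer slot,
+// which gives an approximate shuffle in bounded memory.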
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-chain-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Copy nnet3+chain examples for neural network training, from the input to output,\n"
+        "while randomly shuffling the order. This program will keep all of the examples\n"
+        "in memory at once, unless you use the --buffer-size option.\n"
+        "\n"
+        "Usage:  nnet3-chain-shuffle-egs [options] <egs-rspecifier> <egs-wspecifier>\n"
+        "\n"
+        "nnet3-chain-shuffle-egs --srand=1 ark:train.egs ark:shuffled.egs\n";
+
+    int32 srand_seed = 0;
+    int32 buffer_size = 0;
+    ParseOptions po(usage);
+    po.Register("srand", &srand_seed, "Seed for random number generator ");
+    po.Register("buffer-size", &buffer_size, "If >0, size of a buffer we use "
+                "to do limited-memory partial randomization. Otherwise, do "
+                "full randomization.");
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+
+    int64 num_done = 0;
+
+    std::vector<std::pair<std::string, NnetChainExample*> > egs;
+
+    SequentialNnetChainExampleReader example_reader(examples_rspecifier);
+    NnetChainExampleWriter example_writer(examples_wspecifier);
+    if (buffer_size == 0) {  // Do full randomization
+      // Putting in an extra level of indirection here to avoid excessive
+      // computation and memory demands when we have to resize the vector.
+
+      for (; !example_reader.Done(); example_reader.Next())
+        egs.push_back(std::pair<std::string, NnetChainExample*>(
+            example_reader.Key(),
+            new NnetChainExample(example_reader.Value())));
+
+      std::random_shuffle(egs.begin(), egs.end());
+    } else {
+      KALDI_ASSERT(buffer_size > 0);
+      egs.resize(buffer_size,
+                 std::pair<std::string, NnetChainExample*>("", NULL));
+      for (; !example_reader.Done(); example_reader.Next()) {
+        int32 index = RandInt(0, buffer_size - 1);
+        if (egs[index].second == NULL) {
+          egs[index] = std::pair<std::string, NnetChainExample*>(
+              example_reader.Key(),
+              new NnetChainExample(example_reader.Value()));
+        } else {
+          example_writer.Write(egs[index].first, *(egs[index].second));
+          egs[index].first = example_reader.Key();
+          *(egs[index].second) = example_reader.Value();
+          num_done++;
+        }
+      }
+    }
+    for (size_t i = 0; i < egs.size(); i++) {
+      if (egs[i].second != NULL) {
+        example_writer.Write(egs[i].first, *(egs[i].second));
+        delete egs[i].second;
+        num_done++;
+      }
+    }
+
+    KALDI_LOG << "Shuffled order of " << num_done
+              << " neural-network training examples "
+              << (buffer_size ? "using a buffer (partial randomization)" : "");
+
+    return (num_done == 0 ? 1 : 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/chainbin/nnet3-chain-subset-egs.cc b/src/chainbin/nnet3-chain-subset-egs.cc
new file mode 100644
index 00000000000..0206003ab13
--- /dev/null
+++ b/src/chainbin/nnet3-chain-subset-egs.cc
@@ -0,0 +1,101 @@
+// chainbin/nnet3-chain-subset-egs.cc
+
+// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey)
+//           2014      Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-chain-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Creates a random subset of the input nnet3+chain examples, of a specified size.\n"
+        "Uses no more memory than the size of the subset.\n"
+        "\n"
+        "Usage:  nnet3-chain-subset-egs [options] <egs-rspecifier> <subset-egs-wspecifier>\n"
+        "\n"
+        "e.g.\n"
+        "nnet3-chain-get-egs [args] ark:- | nnet3-chain-subset-egs --n=1000 ark:- ark:subset.cegs\n";
+
+    int32 srand_seed = 0;
+    int32 n = 1000;
+    bool randomize_order = true;
+    ParseOptions po(usage);
+    po.Register("srand", &srand_seed, "Seed for random number generator ");
+    po.Register("n", &n, "Number of examples to output");
+    po.Register("randomize-order", &randomize_order, "If true, randomize the order "
+                "of the output");
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+
+    std::vector<std::pair<std::string, NnetChainExample> > egs;
+    egs.reserve(n);
+
+    SequentialNnetChainExampleReader example_reader(examples_rspecifier);
+
+    int64 num_read = 0;
+    for (; !example_reader.Done(); example_reader.Next()) {
+      num_read++;
+      if (num_read <= n) {
+        egs.resize(egs.size() + 1);
+        egs.back().first = example_reader.Key();
+        egs.back().second = example_reader.Value();
+      } else {
+        BaseFloat keep_prob = n / static_cast<BaseFloat>(num_read);
+        if (WithProb(keep_prob)) {  // With probability "keep_prob"
+          int32 index = RandInt(0, n-1);
+          egs[index].first = example_reader.Key();
+          egs[index].second = example_reader.Value();
+        }
+      }
+    }
+    if (randomize_order)
+      std::random_shuffle(egs.begin(), egs.end());
+
+    NnetChainExampleWriter writer(examples_wspecifier);
+    for (size_t i = 0; i < egs.size(); i++) {
+      writer.Write(egs[i].first, egs[i].second);
+    }
+
+    KALDI_LOG << "Selected a subset of " << egs.size() << " out of " << num_read
+              << " nnet3+chain training examples ";
+
+    return (num_read != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/chainbin/nnet3-chain-train.cc b/src/chainbin/nnet3-chain-train.cc
new file mode 100644
index 00000000000..5486a5f7fe9
--- /dev/null
+++ b/src/chainbin/nnet3-chain-train.cc
@@ -0,0 +1,99 @@
+// chainbin/nnet3-chain-train.cc
+
+// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-chain-training.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Train nnet3+chain neural network parameters with backprop and stochastic\n" + "gradient descent. Minibatches are to be created by nnet3-chain-merge-egs in\n" + "the input pipeline. This training program is single-threaded (best to\n" + "use it with a GPU).\n" + "\n" + "Usage: nnet3-chain-train [options] \n" + "\n" + "nnet3-chain-train 1.raw den.fst 'ark:nnet3-merge-egs 1.cegs ark:-|' 2.raw\n"; + + bool binary_write = true; + std::string use_gpu = "yes"; + NnetChainTrainingOptions opts; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + + opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + std::string nnet_rxfilename = po.GetArg(1), + den_fst_rxfilename = po.GetArg(2), + examples_rspecifier = po.GetArg(3), + nnet_wxfilename = po.GetArg(4); + + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + bool ok; + + { + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); + + NnetChainTrainer trainer(opts, den_fst, &nnet); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + trainer.Train(example_reader.Value()); + + ok = trainer.PrintTotalStats(); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + WriteKaldiObject(nnet, nnet_wxfilename, binary_write); + KALDI_LOG << "Wrote raw model to " << nnet_wxfilename; + return (ok ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/configure b/src/configure index a758dbeb50b..d8675e0925a 100755 --- a/src/configure +++ b/src/configure @@ -13,7 +13,9 @@ # ./configure --shared ## shared libraries. # ./configure --mkl-root=/opt/intel/mkl # ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes -# ./configure --openblas-root=../tools/OpenBLAS/install # before doing +# ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes --mkl-threading=tbb +# This is for MKL 11.3 -- which does not seem to provide Intel OMP libs +# ./configure --openblas-root=../tools/OpenBLAS/install # before doing # # this, cd to ../tools and type "make openblas". Note: # # this is not working correctly on all platforms, do "make test" # # and look out for segmentation faults. @@ -21,7 +23,7 @@ #This should be incremented after every significant change of the configure script #I.e. 
after each change that affects the kaldi.mk or the build system as whole -CONFIGURE_VERSION=2 +CONFIGURE_VERSION=3 function rel2abs { if [ ! -z "$1" ]; then @@ -39,7 +41,7 @@ function read_dirname { function is_set { local myvar=${1:-notset} - if [ "$myvar" == "notset" ]; then + if [ "$myvar" == "notset" ]; then return 1 else return 0 @@ -50,7 +52,7 @@ function is_set { ## First do some checks. These verify that all the things are ## here that should be here. -if [ "`basename $PWD`" != "src" ]; then +if ! [ -x "$PWD/configure" ]; then echo 'You must run "configure" from the src/ directory.' exit 1 fi @@ -67,10 +69,10 @@ unset OPENBLASROOT unset MKLLIBDIR function usage { - echo 'Usage: ./configure [--static|--shared] [--threaded-atlas={yes|no}] [--atlas-root=ATLASROOT] [--fst-root=FSTROOT] + echo 'Usage: ./configure [--static|--shared] [--threaded-atlas={yes|no}] [--atlas-root=ATLASROOT] [--fst-root=FSTROOT] [--openblas-root=OPENBLASROOOT] [--clapack-root=CLAPACKROOT] [--mkl-root=MKLROOT] [--mkl-libdir=MKLLIBDIR] - [--omp-libdir=OMPDIR] [--static-fst={yes|no}] [--static-math={yes|no}] [--threaded-math={yes|no}] [--mathlib=ATLAS|MKL|CLAPACK|OPENBLAS] - [--use-cuda={yes|no}] [--cudatk-dir=CUDATKDIR]'; + [--omp-libdir=OMPDIR] [--static-fst={yes|no}] [--static-math={yes|no}] [--threaded-math={yes|no}] [--mathlib=ATLAS|MKL|CLAPACK|OPENBLAS] + [--use-cuda={yes|no}] [--cudatk-dir=CUDATKDIR][--mkl-threading=sequential|iomp|tbb|gomp]'; } threaded_atlas=false # By default, use the un-threaded version of ATLAS. @@ -79,68 +81,115 @@ static_math=false static_fst=false use_cuda=true dynamic_kaldi=false +mkl_threading=sequential cmd_line="$0 $@" # Save the command line to include in kaldi.mk while [ $# -gt 0 ]; do case "$1" in - --help) usage; exit 0 ;; - --version) echo $CONFIGURE_VERSION; exit 0 ;; + --help) + usage; exit 0 ;; + --version) + echo $CONFIGURE_VERSION; exit 0 ;; --static) - # FIXME depends on order of options first--static/--shared then --static-math -> it will override it - dynamic_kaldi=false; - static_math=true; - static_fst=true; - shift ;; + dynamic_kaldi=false; + static_math=true; + static_fst=true; + shift ;; --shared) - dynamic_kaldi=true; - static_math=false; - static_fst=false; - shift ;; + dynamic_kaldi=true; + static_math=false; + static_fst=false; + shift ;; --atlas-root=*) - ATLASROOT=`read_dirname $1`; shift ;; + ATLASROOT=`read_dirname $1`; + shift ;; --threaded-atlas=yes) - threaded_atlas=true; shift ;; + threaded_atlas=true; + shift ;; --threaded-atlas=no) - threaded_atlas=false; shift ;; + threaded_atlas=false; + shift ;; --threaded-math=yes) - threaded_atlas=true; threaded_math=true; shift ;; + threaded_atlas=true; + threaded_math=true; + mkl_threading=iomp + shift ;; --threaded-math=no) - threaded_atlas=false; threaded_math=false; shift ;; + threaded_atlas=false; + threaded_math=false; + mkl_threading=sequential + shift ;; --use-cuda=yes) - use_cuda=true; shift ;; + use_cuda=true; + shift ;; --use-cuda=no) - use_cuda=false; shift ;; + use_cuda=false; + shift ;; --static-math=yes) - static_math=true; shift ;; + static_math=true; + shift ;; --static-math=no) - static_math=false; shift ;; + static_math=false; + shift ;; --static-fst=yes) - static_fst=true; shift ;; + static_fst=true; + shift ;; --static-fst=no) - static_fst=false; shift ;; + static_fst=false; + shift ;; + --mkl-threading=sequential) + threaded_atlas=false; + threaded_math=false; + mkl_threading=sequential; + shift ;; + --mkl-threading=*) + mkl_threading=`expr "X$1" : '[^=]*=\(.*\)'`; + 
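+      # Any --mkl-threading value other than 'sequential' (i.e. iomp, gomp or
+      # tbb, matching the library tables below) implies a threaded MKL, so
+      # threaded math is switched on below.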
threaded_atlas=true; + threaded_math=true; + shift ;; --fst-root=*) - FSTROOT=`read_dirname $1`; shift ;; + FSTROOT=`read_dirname $1`; + shift ;; --clapack-root=*) - CLAPACKROOT=`read_dirname $1`; shift ;; + CLAPACKROOT=`read_dirname $1`; + shift ;; --openblas-root=*) - OPENBLASROOT=`read_dirname $1`; shift ;; + OPENBLASROOT=`read_dirname $1`; + shift ;; --mkl-root=*) - MKLROOT=`read_dirname $1`; shift ;; + MKLROOT=`read_dirname $1`; + shift ;; --mkl-libdir=*) - MKLLIBDIR=`read_dirname $1`; shift ;; + MKLLIBDIR=`read_dirname $1`; + shift ;; + --speex-root=*) + SPEEXROOT=`read_dirname $1`; + shift ;; + --speex-libdir=*) + SPEEXLIBDIR=`read_dirname $1`; + shift ;; + --speex-includedir=*) + SPEEXINCLUDEDIR=`read_dirname $1`; + shift ;; --omp-libdir=*) - OMPLIBDIR=`read_dirname $1`; shift ;; + OMPLIBDIR=`read_dirname $1`; + shift ;; --mathlib=*) - MATHLIB=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; + MATHLIB=`expr "X$1" : '[^=]*=\(.*\)'`; + shift ;; --cudatk-dir=*) - CUDATKDIR=`read_dirname $1`; shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only + CUDATKDIR=`read_dirname $1`; + shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only *) echo "Unknown argument: $1, exiting"; usage; exit 1 ;; esac done - +# the idea here is that if you change the configuration options from using +# CUDA to not using it, or vice versa, we want to recompile all parts of the +# code that may use a GPU. Touching this file is a way to force this. +touch cudamatrix/cu-common.h 2>/dev/null function failure { echo "***configure failed: $* ***" >&2 @@ -178,6 +227,7 @@ function check_for_slow_expf { cd .. } + function exit_success { check_for_bad_gcc; check_for_slow_expf; @@ -186,6 +236,27 @@ function exit_success { } + +function check_library { + local libpath=$1 + local libname=$2 + local libext=$3 + local full_libname="$libpath/$libname.$libext" + ##echo "Testing $full_libname" >&2 + test -f "$full_libname" && return ; + return 1 +} + + + +#Check if at least one of these variables is set +#If yes, we want to switch to using the MKL +is_set $MKLLIBDIR && echo "Force-configuring KALDI to use MKL" && export MATHLIB="MKL" +is_set $MKLROOT && echo "Force-configuring KALDI to use MKL"&& export MATHLIB="MKL" +is_set $CLAPACKROOT && echo "Force-configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" +is_set $OPENBLASROOT && echo "Force-configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" + +#MKL functions function linux_configure_mkllibdir { local mklroot=$1 @@ -198,120 +269,156 @@ function linux_configure_mkllibdir { fi } +function linux_configure_mkl_includes { + test -d $1/include && echo "$1/include" && return; + test -d $2/../../include && echo "$2/../../include" && return; + failure "Could not find the MKL include directory" +} + + function linux_configure_mkl_libraries { local mkllibdir=$1 local static=$2 local threaded=$3 + local mplib=$4 + + declare -A mkl_libs + mkl_libs=( + [sequential]="mkl_intel_lp64 mkl_core mkl_sequential" + [gomp]="mkl_intel_lp64 mkl_core mkl_gnu_thread" + [iomp]="mkl_intel_lp64 mkl_core mkl_intel_thread " + [tbb]="mkl_intel_lp64 mkl_core mkl_tbb_thread " + ) + + if [ -z "${mkl_libs[$threaded]}" ]; then + echo >&2 "Unknown threading mode: $threaded" + return 1; + fi - #these lines were generated using the Intel Link Line Advisor 2.2 - local threaded_libs="mkl_intel_lp64 mkl_intel_thread mkl_core" - local sequential_libs="mkl_intel_lp64 mkl_sequential mkl_core" - - if ! 
$static && $threaded ; then - for file in $threaded_libs; do - local libfile=$mkllibdir/lib$file.so - check_exists $libfile - done - echo "-L$mkllibdir -Wl,-rpath=$mkllibdir -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm" - elif ! $static && ! $threaded ; then - for file in $sequential_libs; do + local linkline="" + if ! $static ; then + linkline="-L$mkllibdir -Wl,-rpath=$mkllibdir" + for file in ${mkl_libs[$threaded]}; do local libfile=$mkllibdir/lib$file.so check_exists $libfile - done - echo "-L$mkllibdir -Wl,-rpath=$mkllibdir -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5 -lpthread -lm" - elif $static && $threaded ; then - local linkline="" - test -f "$mkllibdir/libmkl_solver_lp64.a" && linkline="$linkline $mkllibdir/libmkl_solver_lp64.a" - linkline="$linkline -Wl,--start-group" - for file in $threaded_libs; do - local libfile=$mkllibdir/lib$file.a - check_exists $libfile - linkline="$linkline $libfile" - done - #linkline="$linkline -Wl,--end-group -liomp5 -lpthread -lm " - linkline="$linkline -Wl,--end-group " - echo $linkline - elif $static && ! $threaded ; then - local linkline="" - test -f "$mkllibdir/libmkl_solver_lp64_sequential.a" && linkline="$linkline $mkllibdir/libmkl_solver_lp64_sequential.a" + linkline+=" -l$file " + done + else + if [ $threaded == "sequential" ] ; then + test -f "$mkllibdir/libmkl_solver_lp64.a" && \ + linkline="$linkline $mkllibdir/libmkl_solver_lp64.a" + else + test -f "$mkllibdir/libmkl_solver_lp64_sequential.a" && \ + linkline="$linkline $mkllibdir/libmkl_solver_lp64_sequential.a" + fi linkline="$linkline -Wl,--start-group" - for file in $sequential_libs; do - local libfile=$mkllibdir/lib$file.a + for file in ${mkl_libs[$threaded]}; do + local libfile=$mkllibdir/lib${file}.a check_exists $libfile linkline="$linkline $libfile" - done - #linkline="$linkline -Wl,--end-group -liomp5 -lpthread -lm " - linkline="$linkline -Wl,--end-group " - echo $linkline - else - return 1; + done + linkline="$linkline -Wl,--end-group " fi + echo "$linkline" } - -function linux_configure_mkl_includes { - test -d $1/include && echo "$1/include" && return; - test -d $2/../../include && echo "$2/../../include" && return; - failure "Could not find the MKL include directory" -} - -function check_library { - local libpath=$1 - local libname=$2 - local libext=$3 - - local full_libname="$libpath/$libname.$libext" - ##echo "Testing $full_libname" >&2 - test -f "$full_libname" && return ; - return 1 +function linux_configure_mkl_extra { + local static=$1 + local threaded=$2 + + declare -A extra_libs + extra_libs=( + [sequential]="-ldl -lpthread -lm" + [gomp]="-lgomp -ldl -lpthread -lm" + [iomp]="-ldl -lpthread -lm" + [tbb]=" -ldl -lpthread -lm " + ) + echo "$linkline ${extra_libs[$threaded]}" } +function linux_configure_threadinglibdir { + local library=$1 + local mklroot=$2 + local mkllibdir=$3 + local libexts=$4 -function linux_configure_omplibdir { - local mklroot=$1 - local mkllibdir=$2 - local libexts=$3 - - ##First we try to use the library in the same directory ##where the mkl libraries reside ##Afterwards, just try some possibilities for different MKL layouts for libext in $libexts; do - echo "Testing $libext from [$libexts] " >&2 + check_library $mkllibdir "lib$library" $libext \ + && echo `readlink -f $mkllibdir` && return 0 - check_library $mkllibdir "libiomp5" $libext \ - && echo `readlink -f $mkllibdir` && return - local testdir=`(cd $mklroot; cd ..; cd lib/intel64;pwd)` - test -d $testdir && check_library $testdir "libiomp5" $libext && echo 
`readlink -f $testdir` && return; + test -d $testdir && check_library $testdir "lib$library" $libext && echo `readlink -f $testdir` && return 0; local testdir=`(cd $mklroot; cd ..; cd lib/em64t;pwd)` - test -d $testdir && check_library $testdir "libiomp5" $libext && echo `readlink -f $testdir` && return; - + test -d $testdir && check_library $testdir "lib$library" $libext && echo `readlink -f $testdir` && return 0; + local testdir=`(cd $mkllibdir; cd ../../..; cd lib/intel64;pwd)` - test -d $testdir && check_library $testdir "libiomp5" $libext && echo `readlink -f $testdir` && return; + test -d $testdir && check_library $testdir "lib$library" $libext && echo `readlink -f $testdir` && return 0; local testdir=`(cd $mklroot; cd ../../..; cd lib/em64t;pwd)` - test -d $testdir && check_library $testdir "libiomp5" $libext && echo `readlink -f $testdir` && return; + test -d $testdir && check_library $testdir "lib$library" $libext && echo `readlink -f $testdir` && return 0; done #failure "Could not find the library iomp5, use the configure switch --omp-libdir" return 1 } -#Check if at least one of these variables is set -#If yes, we want to switch to using the MKL -is_set $MKLLIBDIR && echo "Force-configuring KALDI to use MKL" && export MATHLIB="MKL" -is_set $MKLROOT && echo "Force-configuring KALDI to use MKL"&& export MATHLIB="MKL" -is_set $CLAPACKROOT && echo "Force-configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" -is_set $OPENBLASROOT && echo "Force-configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" +function linux_configure_mkl_threading { + local mklroot=$1 + local mkllibdir=$2 + local static=$3 + local threading=$4 + + declare -A libs + libs=( + [sequential]="" + [gomp]="" + [iomp]="iomp5" + [tbb]="tbb" + ) + + echo >&2 "Configuring MKL threading as $threading" + library=${libs[$threading]} + if [ -z "$library" ]; then + return 0 + fi + + if ! is_set $OMPLIBDIR ; then + if $static ; then + OMPLIBDIR=`linux_configure_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "a"` + else + OMPLIBDIR=`linux_configure_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "so"` + fi + fi + + check_library $OMPLIBDIR "lib$library" "a" || \ + check_library $OMPLIBDIR "lib$library" "so" || \ + failure "Could not find the $library library, have your tried the --omp-libdir switch?" + + OMP_LINK_LINE='' + # TODO(arnab): in the following conditional, the $static_math test is + # needed since the OpenMP library is assumed to be dynamic. + if [ "$OMPLIBDIR" != "$MKLLIBDIR" ] ; then + OMP_LINK_LINE="-L${OMPLIBDIR}" + fi + #if the libiomp5 library is dynamic, we add the rpath attribute + if ! $static_math ; then + OMP_LINK_LINE="$OMP_LINK_LINE -Wl,-rpath=$OMPLIBDIR -l$library" + else + OMP_LINK_LINE="$OMP_LINK_LINE -Wl,-Bstatic -l$library -Wl,-Bdynamic" + fi + echo "$OMP_LINK_LINE" +} ## -##CUDA is used in src/cudamatrix and src/nnet{,bin} only. -##It is used to accelerate the neural network training, -##the rest of kaldi is running on CPUs. +## CUDA is used only in selected directories including src/cudamatrix, src/nnet* +## and src/chain*. It is used to accelerate the neural network training, the +## rest of kaldi runs on CPUs. ## -function linux_configure_cuda { +function configure_cuda { #check for CUDA toolkit in the system - if [ ! $CUDATKDIR ]; then + if [ ! 
-d "$CUDATKDIR" ]; then for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do if [ -f $base/bin/nvcc ]; then CUDATKDIR=$base @@ -319,7 +426,7 @@ function linux_configure_cuda { done fi - if [ $CUDATKDIR ]; then + if [ -d "$CUDATKDIR" ]; then if [ ! -f $CUDATKDIR/bin/nvcc ]; then failure "Cannnot find nvcc in CUDATKDIR=$CUDATKDIR" fi @@ -329,45 +436,71 @@ function linux_configure_cuda { echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk + # Determine 'CUDA_ARCH', + CUDA_VERSION=$($CUDATKDIR/bin/nvcc -V | tr '.,' '_ ' | awk '/release/{sub(/.*release/,""); print $1;}') # MAJOR_MINOR, + if [ -z "$CUDA_VERSION" ] ; then + echo "Cannot figure out CUDA_VERSION from the nvcc output. Either your CUDA is too new or too old." + exit 1 + fi + + case $CUDA_VERSION in + 5_5) CUDA_ARCH="-gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; + 6_*) CUDA_ARCH="-gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; + 7_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; + *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; + esac + echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk + + # 64bit/32bit? if [ "`uname -m`" == "x86_64" ]; then - cat makefiles/linux_x86_64_cuda.mk >> kaldi.mk + if [ "`uname`" == "Darwin" ]; then + sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk + else + cat makefiles/cuda_64bit.mk >> kaldi.mk + fi else - cat makefiles/linux_cuda.mk >> kaldi.mk + cat makefiles/cuda_32bit.mk >> kaldi.mk fi + else - echo "CUDA will not be used! If you have already installed cuda drivers and cuda toolkit, try using --cudatk-dir=... option. Note: this is only relevant for neural net experiments" + echo "CUDA will not be used! If you have already installed cuda drivers " + echo "and cuda toolkit, try using --cudatk-dir=... option. Note: this is" + echo "only relevant for neural net experiments" fi } function linux_configure_speex { #check whether the user has called tools/extras/install_speex.sh or not - SPEEXROOT=`pwd`/../tools/speex + [ ! -z "$SPEEXROOT" ] || SPEEXROOT=`pwd`/../tools/speex + [ ! -z "$SPEEXLIBDIR" ] || SPEEXLIBDIR="$SPEEXROOT"/lib + [ ! -z "$SPEEXINCLUDEDIR" ] || SPEEXINCLUDEDIR="$SPEEXROOT"/include static_speex=$1 if [ "foo"$static_speex == "foo" ]; then static_speex=false fi - + if $static_speex; then spx_type=a else spx_type=so fi - if [ ! -f "$SPEEXROOT/lib/libspeex.${spx_type}" ];then - echo "Static=[$static_speex] Speex library not found: You can still build Kaldi without Speex." + if [ ! 
-f "$SPEEXLIBDIR/libspeex.${spx_type}" ];then + echo "Info: configuring Kaldi not to link with Speex (don't worry, it's only needed if you" + echo "intend to use 'compress-uncompress-speex', which is very unlikely)" return fi - - if [ -f $SPEEXROOT/include/speex/speex.h ]; then + + if [ -f $SPEEXINCLUDEDIR/speex/speex.h ]; then echo >> kaldi.mk - echo CXXFLAGS += -DHAVE_SPEEX -I${SPEEXROOT}/include >> kaldi.mk - + echo CXXFLAGS += -DHAVE_SPEEX -I${SPEEXINCLUDEDIR} >> kaldi.mk + if $static_speex; then - echo LDLIBS += $SPEEXROOT/lib/libspeex.a + echo LDLIBS += $SPEEXLIBDIR/libspeex.a else - echo LDLIBS += -L${SPEEXROOT}/lib -lspeex >> kaldi.mk - echo LDFLAGS += -Wl,-rpath=${SPEEXROOT}/lib >> kaldi.mk + echo LDLIBS += -L${SPEEXLIBDIR} -lspeex >> kaldi.mk + echo LDFLAGS += -Wl,-rpath=${SPEEXLIBDIR} >> kaldi.mk fi - + echo "Successfully configured with Speex at $SPEEXROOT, (static=[$static_speex])" else echo "Speex will not be used. If you want to use it, run tools/extras/install_speex.sh first." @@ -391,15 +524,18 @@ function linux_atlas_failure { # function we use when we couldn't find fix_cxx_flag echo "** $* ***" echo "** ERROR **" - echo "**Configure cannot proceed automatically, but by editing kaldi.mk" - echo "** you may be able to proceed (replace [somewhere] with a directory);" - echo "** or install the ATLAS package on your machine (if you are system " - echo " administrator, you can do it easily by searching the atlas packages " - echo " with commands like 'apt-cache search libatlas' or 'yum search atlas'," - echo " and install the packages with commands 'apt-get install' or 'yum install') " - echo " e.g. 'apt-get install libatlas-dev libatlas-base-dev';" - echo "** or try going to ../tools and running install_atlas.sh, and running" - echo " this script (configure) again." + echo "** Configure cannot proceed automatically." + echo "** If you know that you have ATLAS installed somewhere on your machine, you" + echo "** may be able to proceed by replacing [somewhere] in kaldi.mk with a directory." + echo "** If you have sudo (root) access you could install the ATLAS package on your" + echo "** machine, e.g. 'sudo apt-get install libatlas-dev libatlas-base-dev' or" + echo "** 'sudo yum install atlas.x86_64' or 'sudo zypper install libatlas3-devel'," + echo "** or on cygwin, install atlas from the installer GUI; and then run ./configure" + echo "** again." + echo "**" + echo "** Otherwise (or if you prefer OpenBLAS for speed), you could go the OpenBLAS" + echo "** route: cd to ../tools, type 'extras/install_openblas.sh', cd back to here," + echo "** and type './configure --openblas-root=../tools/OpenBLAS/install'" exit 1; } @@ -426,11 +562,11 @@ function linux_check_static { function linux_configure_debian_ubuntu { m=$1 ATLASLIBS="/usr/lib$m/atlas-base/libatlas.so.3gf /usr/lib$m/atlas-base/libf77blas.so.3gf /usr/lib$m/atlas-base/libcblas.so.3gf /usr/lib$m/atlas-base/liblapack_atlas.so.3gf" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') - if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then + if ! 
nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then exit 1; fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk @@ -438,18 +574,18 @@ function linux_configure_debian_ubuntu { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } function linux_configure_debian_ubuntu3 { ATLASLIBS="/usr/lib/libatlas.so.3 /usr/lib/libf77blas.so.3 /usr/lib/libcblas.so.3 /usr/lib/liblapack_atlas.so.3" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') - if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then + if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then exit 1; fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk @@ -457,29 +593,29 @@ function linux_configure_debian_ubuntu3 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } function linux_configure_debian7 { ATLASLIBS="/usr/lib/atlas-base/libatlas.so.3.0 /usr/lib/atlas-base/libf77blas.so.3.0 /usr/lib/atlas-base/libcblas.so.3 /usr/lib/atlas-base/liblapack_atlas.so.3" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') - if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then + if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then exit 1; fi libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_debian7" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -487,18 +623,18 @@ function linux_configure_debian7 { function linux_configure_redhat { m=$1 # 64 or empty. ATLASLIBS="/usr/lib$m/atlas/libatlas.so.3 /usr/lib$m/atlas/libf77blas.so.3 /usr/lib$m/atlas/libcblas.so.3 /usr/lib$m/atlas/libclapack.so.3" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -508,18 +644,18 @@ function linux_configure_redhat_fat { # See http://stackoverflow.com/questions/13439296/build-shared-libraries-in-atlas. m=$1 # 64 or empty. ATLASLIBS="/usr/lib$m/atlas/libsatlas.so.3 /usr/lib$m/atlas/libtatlas.so.3" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! 
-f $f ] && return 1; done libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat_fat" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -553,25 +689,25 @@ function linux_configure_static { fi fi done - if [ "$ATLASLIBS" == "" ]; then + if [ "$ATLASLIBS" == "" ]; then echo Could not find any libraries $ATLASLIBDIR/{liblapack,liblapack_atlas,libclapack} that seem to be an ATLAS CLAPACK library. return ; fi - + for x in lib${pt}cblas.a libatlas.a lib${pt}f77blas.a; do if [ ! -f $ATLASLIBDIR/$x ]; then echo "Configuring static ATLAS libraries failed: Could not find library $x in directory $ATLASLIBDIR" return 1; fi ATLASLIBS="$ATLASLIBS $ATLASLIBDIR/$x" - done + done if $threaded_atlas; then ATLASLIBS="$ATLASLIBS"; fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -591,7 +727,7 @@ function linux_check_dynamic { return 0; fi done - echo "... no {libatlas,lib${pt}atlas}.so in $dir"; + # echo "... no {libatlas,lib${pt}atlas}.so in $dir"; return 1; } @@ -635,7 +771,7 @@ function linux_configure_dynamic { echo Could not find any libraries $ATLASLIBDIR/{liblapack,liblapack_atlas,libclapack} that seem to be an ATLAS CLAPACK library. return 1; fi - + for x in ${pt}cblas atlas ${pt}f77blas; do if [ ! -f $ATLASLIBDIR/lib$x.so ]; then echo "Configuring dynamic ATLAS libraries failed: Could not find library $x in directory $ATLASLIBDIR" @@ -650,7 +786,7 @@ function linux_configure_dynamic { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -693,7 +829,7 @@ echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk echo "FSTROOT = $FSTROOT" >> kaldi.mk # Check installed OpenFst version and add C++11 flags if OpenFst >= 1.4 -OPENFST_VER=`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'` +OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk OPENFST_VER_NUM=`echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d"` if [ $OPENFST_VER_NUM -ge 10400 ]; then @@ -710,7 +846,7 @@ echo "Doing OS specific configurations ..." # which crashes on Darwin. Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. if [ "`uname`" == "Darwin" ]; then - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda echo "On Darwin: checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate.framework to compile on Darwin." 
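# Illustration only (hypothetical version string, not part of the patch): the
# `printf "%d%02d%02d"` packing in the OPENFST_VER_NUM line above turns "1.4.1"
# into 10401, so any OpenFst release from the 1.4 series onwards passes the
# `-ge 10400` test and gets the C++11 flags:
#   echo "1.4.1" | sed 's/\./ /g' | xargs printf "%d%02d%02d"   # prints 10401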
@@ -739,7 +875,10 @@ if [ "`uname`" == "Darwin" ]; then elif [ "$osx_ver" == "10.10" ]; then check_exists makefiles/darwin_10_10.mk cat makefiles/darwin_10_10.mk >> kaldi.mk - else + elif [ "$osx_ver" == "10.11" ]; then + check_exists makefiles/darwin_10_11.mk + cat makefiles/darwin_10_11.mk >> kaldi.mk + else failure "OS X version '$osx_ver' not supported" fi echo "Configuration succeeded for platform Darwin." @@ -780,7 +919,7 @@ if [ "`uname`" == "Linux" ]; then failure "Could not find required header files cblas.h or clapack.h in ATLAS dir '$ATLASROOT/include'" fi echo "Using ATLAS as the linear algebra library." - + # Finding out where the libraries are located: # First we look for the static libraries and then look for dynamic ones. # We're looking for four libraries, all in the same directory, named @@ -825,8 +964,8 @@ if [ "`uname`" == "Linux" ]; then failure "MKL on Linux only supported for Intel(R) 64 architecture (x86_64). See makefiles/linux_64_mkl.mk to manually configure for other platforms." fi - - if is_set "$MKLROOT" -a ! is_set "$MKLLIBDIR"; then + + if ( is_set "$MKLROOT" && ! is_set "$MKLLIBDIR" ); then echo -n "Configuring MKL library directory: " MKLLIBDIR=`linux_configure_mkllibdir $MKLROOT` if [ $? -ne 0 ]; then @@ -836,47 +975,38 @@ if [ "`uname`" == "Linux" ]; then fi fi - MKL_LINK_LINE=`linux_configure_mkl_libraries "$MKLLIBDIR" $static_math $threaded_math` || exit 1 + MKL_LINK_LINE=`linux_configure_mkl_libraries "$MKLLIBDIR" $static_math $mkl_threading` || exit 1 + echo "MKL configured with threading: $mkl_threading, libs: $MKL_LINK_LINE" MKL_COMPILE_LINE=`linux_configure_mkl_includes "$MKLROOT" "$MKLLIBDIR"` || exit 1 echo "MKL include directory configured as: $MKL_COMPILE_LINE" MKL_COMPILE_LINE=" -I${MKL_COMPILE_LINE} " - - if ! is_set $OMPLIBDIR ; then - if $static_math ; then - OMPLIBDIR=`linux_configure_omplibdir "$MKLROOT" "$MKLLIBDIR" "a"` - else - OMPLIBDIR=`linux_configure_omplibdir "$MKLROOT" "$MKLLIBDIR" "so"` - fi - fi - check_library $OMPLIBDIR "libiomp5" "a" || check_library $OMPLIBDIR "libiomp5" "so" \ - || failure "Could not find the iomp5 library, have your tried the --omp-libdir switch?" - echo "OMP library directory configured as: $OMPLIBDIR" - OMP_LINK_LINE='' - # TODO(arnab): in the following conditional, the $static_math test is - # needed since the OpenMP library is assumed to be dynamic. - if [ "$OMPLIBDIR" != "$MKLLIBDIR" ] ; then - OMP_LINK_LINE="-L${OMPLIBDIR}" - #if the libiomp5 library is dynamic, we add the rpath attribute - if ! $static_math ; then - OMP_LINK_LINE="$OMP_LINK_LINE -Wl,-rpath=$OMPLIBDIR" - else - OMP_LINK_LINE="$OMP_LINK_LINE -Wl,-Bstatic -liomp5 -Wl,-Bdynamic" - fi + + THREADING_LINE=`linux_configure_mkl_threading $MKLROOT $MKLLIBDIR $static_math $mkl_threading` || exit 1 + EXTRA_LIBS=`linux_configure_mkl_extra $static_math $mkl_threading` || exit 1 + if [ ! -z "$THREADING_LINE" ] || [ ! -z "$EXTRA_LIBS" ]; then + echo "MKL threading libraries configured as $THREADING_LINE $EXTRA_LIBS" fi - + echo "Using Intel MKL as the linear algebra library." + ( + cd probe; rm -f mkl-test; + g++ mkl-test.cc -o mkl-test $MKL_COMPILE_LINE $MKL_LINK_LINE $THREADING_LINE $EXTRA_LIBS || exit 1 + test -f ./mkl-test || exit 1 + ./mkl-test || exit 1 + cd .. + ) || failure "Cannot validate the MKL switches" echo MKLROOT = $MKLROOT >> kaldi.mk - if [ ! -z $MKLLIBDIR ]; then + if [ ! 
-z $MKLLIBDIR ]; then echo MKLLIB = $MKLLIBDIR >> kaldi.mk fi check_exists makefiles/linux_x86_64_mkl.mk cat makefiles/linux_x86_64_mkl.mk >> kaldi.mk fix_cxx_flag - echo "MKLFLAGS = ${MKL_LINK_LINE} ${OMP_LINK_LINE} " >> kaldi.mk + echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with MKL libs from $MKLROOT" exit_success; @@ -899,7 +1029,7 @@ if [ "`uname`" == "Linux" ]; then cat makefiles/linux_clapack.mk >> kaldi.mk fix_cxx_flag echo "Warning (CLAPACK): this part of the configure process is not properly tested and will not work." - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" exit_success; @@ -908,7 +1038,7 @@ if [ "`uname`" == "Linux" ]; then if [ -z "$OPENBLASROOT" ]; then failure "Must specify the location of OPENBLAS with --openblas-root option (and it must exist)" fi - if [ ! -f $OPENBLASROOT/lib/libopenblas.so ]; then + if [ ! -f $OPENBLASROOT/lib/libopenblas.so ]; then failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.so" fi echo "Your math library seems to be OpenBLAS. Configuring appropriately." @@ -923,11 +1053,11 @@ if [ "`uname`" == "Linux" ]; then echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk cat makefiles/linux_openblas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured OpenBLAS from $OPENBLASROOT." exit_success; - else + else failure "Unsupported linear algebra library '$MATHLIB'" fi fi diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 34b621b428f..1bfb087540a 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -1,68 +1,36 @@ - all: -OPENFST_CXXFLAGS = -OPENFST_LDLIBS = - - include ../kaldi.mk - LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \ cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test \ - cu-sparse-matrix-test - + cu-sparse-matrix-test cu-device-test OBJFILES = cu-device.o cu-math.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o \ - cu-sparse-matrix.o + cu-sparse-matrix.o cu-allocator.o cu-array.o ifeq ($(CUDA), true) OBJFILES += cu-kernels.o cu-randkernels.o endif LIBNAME = kaldi-cudamatrix -all: $(LIBFILE) - +ADDLIBS = ../matrix/kaldi-matrix.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ + ../base/kaldi-base.a +# Make sure we have CUDA_ARCH from kaldi.mk, ifeq ($(CUDA), true) - #Default compute capability architectures we compile with - CUDA_ARCH=-gencode arch=compute_20,code=sm_20 - #Get the CUDA Toolkit version (remove decimal point char) - CUDA_VERSION=$(shell $(CUDATKDIR)/bin/nvcc -V | grep release | sed -e 's|.*release ||' -e 's|,.*||' -e 's|\.||') - #For toolkit 4.2 or newer, add the compute capability 3.0 - CUDA_VER_GT_4_2 := $(shell [ $(CUDA_VERSION) -ge 42 ] && echo true) - ifeq ($(CUDA_VER_GT_4_2), true) - CUDA_ARCH += -gencode arch=compute_30,code=sm_30 - endif - #For toolkit 5.0 or newer, add the compute capability 3.5 - CUDA_VER_GT_5_0 := $(shell [ $(CUDA_VERSION) -ge 50 ] && echo true) - ifeq ($(CUDA_VER_GT_5_0), true) - CUDA_ARCH += -gencode arch=compute_35,code=sm_35 - endif - #For 
toolkit 6.0 or newer, add the compute capability 5.0 - CUDA_VER_GT_6_0 := $(shell [ $(CUDA_VERSION) -ge 60 ] && echo true) - ifeq ($(CUDA_VER_GT_6_0), true) - CUDA_ARCH += -gencode arch=compute_50,code=sm_50 - endif - #For toolkit older than 6.5, add the compute capability 1.0 - CUDA_VER_GT_6_5 := $(shell [ $(CUDA_VERSION) -ge 65 ] && echo true) - ifneq ($(CUDA_VER_GT_6_5), true) - CUDA_ARCH += -gencode arch=compute_13,code=sm_13 \ - -gencode arch=compute_10,code=sm_10 + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') endif endif - -#implicit rule for kernel compilation +# Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ - -ADDLIBS = ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../util/kaldi-util.a - include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc new file mode 100644 index 00000000000..eacfbdf3c8e --- /dev/null +++ b/src/cudamatrix/cu-allocator.cc @@ -0,0 +1,370 @@ +// cudamatrix/cu-allocator.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + + +#if HAVE_CUDA == 1 + +#include +#include +#include + +#include +#include +#include +#include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-matrix.h" +#include "base/kaldi-error.h" +#include "base/kaldi-utils.h" +#include "util/common-utils.h" + +namespace kaldi { + + +void* CuMemoryAllocator::Malloc(size_t size) { + // For now just call MallocPitch and throw away the pitch, to avoid + // duplicating code here. Apparently the time difference is quite small. + size_t pitch; + return MallocPitch(size, 1, &pitch); +} + +// Returns max(0, floor(log_2(i))). Not tested independently. +static inline size_t IntegerLog2(size_t i) { + size_t ans = 0; + while (i > 256) { + i >>= 8; + ans += 8; + } + while (i > 16) { + i >>= 4; + ans += 4; + } + while (i > 1) { + i >>= 1; + ans++; + } + return ans; +} + +//inline +CuMemoryAllocator::MruCache& CuMemoryAllocator::GetCacheForSize( + size_t num_bytes) { + size_t bucket_index = IntegerLog2(num_bytes); + KALDI_ASSERT(num_bytes > 0 && bucket_index < caches_.size()); + return caches_[bucket_index]; +} + +//inline +void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes, + size_t num_rows, + size_t *pitch) { + num_system_allocations_++; + void *ans; + cudaError_t e; + for (int32 i = 0; i <= 2; i++) { + if (num_rows != 1) { + Timer tim; + e = cudaMallocPitch(&ans, pitch, row_bytes, num_rows); + tot_time_taken_in_cuda_malloc_pitch_ += tim.Elapsed(); + } else { + Timer tim; + // we might save a little time this way. 
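+      // (cudaMallocPitch would pad each row up to an alignment boundary; with
+      // a single row there is nothing to pad, so a plain cudaMalloc is enough
+      // and the pitch is simply row_bytes, as set below.)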
+ e = cudaMalloc(&ans, row_bytes); + tot_time_taken_in_cuda_malloc_ += tim.Elapsed(); + *pitch = row_bytes; + } + if (e != cudaSuccess) { + PrintMemoryUsage(); + // On the first 2 out of the 3 iters, try freeing memory. + if (i <= 1) { + KALDI_WARN << "Allocation of " << row_bytes << " x " + << num_rows << " region failed: freeing some memory and " + << "trying again. "; + BaseFloat new_memory_factor = 1.1; + if (opts_.memory_factor > new_memory_factor) { + KALDI_LOG << "To avoid future problems like this, changing " + << "memory_factor from " << opts_.memory_factor << " to " + << new_memory_factor; + opts_.memory_factor = new_memory_factor; + } + size_t memory_cached = MemoryCached(), + memory_requested = row_bytes * num_rows, + memory_to_free = std::max(memory_cached / 2, + std::min(memory_cached, + memory_requested)); + FreeSomeCachedMemory(memory_to_free); + } else { + KALDI_ERR << "Cannot allocate the requested memory (" + << row_bytes << " x " << num_rows << " = " + << row_bytes * num_rows << " bytes)"; + } + cudaGetLastError(); // Clear the error state. + } else { + break; + } + } + return ans; +} + +void CuMemoryAllocator::PrintMemoryUsage() const { + KALDI_LOG << "Memory usage: " << cur_bytes_allocated_ + << " bytes currently allocated (max: " + << max_bytes_allocated_ << "); " << cur_bytes_used_ + << " currently in use by user (max: " << max_bytes_used_ << ")" + << "; " << num_system_allocations_ << '/' + << num_user_allocations_ << " calls to Malloc* resulted in " + << "CUDA calls."; + KALDI_LOG << "Time taken in cudaMallocPitch=" << tot_time_taken_in_cuda_malloc_pitch_ + << ", in cudaMalloc=" << tot_time_taken_in_cuda_malloc_ + << ", in cudaFree=" << tot_time_taken_in_cuda_free_ + << ", in this->MallocPitch()=" << tot_time_taken_in_malloc_pitch_; +} + +CuMemoryAllocator::CuMemoryAllocator(CuAllocatorOptions opts): + opts_(opts), + caches_(40), + cur_bytes_allocated_(0), + max_bytes_allocated_(0), + cur_bytes_used_(0), + max_bytes_used_(0), + t_(1), + num_user_allocations_(0), + num_system_allocations_(0), + tot_time_taken_in_cuda_malloc_(0.0), + tot_time_taken_in_cuda_malloc_pitch_(0.0), + tot_time_taken_in_cuda_free_(0.0), + tot_time_taken_in_malloc_pitch_(0.0) { } + +void* CuMemoryAllocator::MallocPitch(size_t row_bytes, + size_t num_rows, + size_t *pitch) { + Timer tim; + t_++; + num_user_allocations_++; + size_t requested_bytes = row_bytes * num_rows; + if (cur_bytes_used_ + requested_bytes > max_bytes_used_) + max_bytes_used_ = cur_bytes_used_ + requested_bytes; + MruCache &cache = GetCacheForSize(requested_bytes); + MemoryRequest request(row_bytes, num_rows); + CachedMemoryElement output; + if (cache.Lookup(request, &output)) { + // we have cached memory with this value. + void *ans = output.pointer; + *pitch = output.pitch; + used_map_[ans] = UsedMemoryElement(row_bytes, num_rows, output.pitch); + cur_bytes_used_ += requested_bytes; + tot_time_taken_in_malloc_pitch_ += tim.Elapsed(); + return ans; + } else { + // note: it's important that we already updated max_bytes_used_. 
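+    // (Example with the default memory_factor of 1.5: if the most memory the
+    // user has ever held at once is 4GB, we allow at most 6GB of
+    // allocated-plus-cached memory, and free cached blocks before going past
+    // that limit.)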
+ size_t next_bytes_allocated = cur_bytes_allocated_ + requested_bytes, + max_bytes_to_allocate = + static_cast(opts_.memory_factor * max_bytes_used_); + ssize_t bytes_overflow = next_bytes_allocated - max_bytes_to_allocate; + if (bytes_overflow > 0) { + // The amount we would have allocated, after fulfilling this request, + // would exceed our limits (we don't allow ourselves to allocate more than + // memory_factor times the maximum amount of memory the user ever owns + // during the lifetime of the program). So free some memory. + KALDI_ASSERT(bytes_overflow <= MemoryCached()); // sanity check. + FreeSomeCachedMemory(static_cast(bytes_overflow)); + KALDI_ASSERT(cur_bytes_allocated_ + requested_bytes <= + max_bytes_to_allocate); + } + void *ans = MallocPitchInternal(row_bytes, num_rows, pitch); + cur_bytes_allocated_ += requested_bytes; + if (cur_bytes_allocated_ > max_bytes_allocated_) + max_bytes_allocated_ = cur_bytes_allocated_; + used_map_[ans] = UsedMemoryElement(row_bytes, num_rows, *pitch); + cur_bytes_used_ += requested_bytes; + tot_time_taken_in_malloc_pitch_ += tim.Elapsed(); + return ans; + } +} + +void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) { + Timer tim; + // the next few lines are responsible for increasing the amount of memory we + // are going to free, in case the user requested an amount that's very tiny + // compared with the total amount of memory ever used. This helps us + // to amortize the cost of visiting all of the buckets inside this code. + // (there are only 40 buckets so it's not so big, but we're being careful. + size_t bytes_cached = cur_bytes_allocated_ - cur_bytes_used_, + min_to_free = static_cast(max_bytes_used_ * opts_.delete_factor); + size_t bytes_to_free = std::min(bytes_cached, + std::max(bytes_to_free_in, min_to_free)), + bytes_freed = 0; + + size_t num_caches = caches_.size(), + t = t_; + // size_factor contains the approximate (power-of-two) size of the pointers + // that each cache's pointers contain. The 'cost' of keeping any given pointer, + // we declare to be the time since we last used it multiplied by the size + // of the memory in the pointer. + std::vector size_factor(num_caches); + for (size_t i = 0, j=1; i < num_caches; i++, j *= 2) + size_factor[i] = j; + + std::priority_queue > queue; + // Set up the queue. + for (int32 i = 0; i < num_caches; i++) { + const MruCache &cache = caches_[i]; + size_t cache_t = cache.LeastRecentTime(); + if (cache_t > 0) { // t == 0 means the cache is empty. + size_t interval = t - cache_t; + BaseFloat cost = size_factor[i] * interval; + KALDI_ASSERT(interval > 0); + queue.push(std::pair(cost, i)); + } + } + while (bytes_freed < bytes_to_free) { + // If the following fails it means I made some kind of bookkeeping error, + // and most likely we are trying to free more memory than we really have + // cached. 
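+    // (Each pass of this loop takes the bucket whose oldest cached entry has
+    // the highest cost, i.e. approximate size times age, frees entries from
+    // its least-recently-used end until another bucket becomes more
+    // expensive, then re-queues it with its updated cost.)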
+ KALDI_ASSERT(!queue.empty() && "Code error."); + std::pair p = queue.top(); + int32 cache_index = p.second; + MruCache &cache = caches_[cache_index]; + queue.pop(); + if (queue.empty()) { + while (bytes_freed < bytes_to_free) { + bytes_freed += cache.RemoveLeastRecentlyUsed(); + } + } else { + BaseFloat next_worst_cost = queue.top().first; + while (1) { + bytes_freed += cache.RemoveLeastRecentlyUsed(); + if (bytes_freed >= bytes_to_free) + break; + size_t least_recent_time = cache.LeastRecentTime(); + if (least_recent_time == 0) // this cache is now empty + break; + size_t interval = t - least_recent_time; + KALDI_ASSERT(interval > 0); + BaseFloat cost = size_factor[cache_index] * interval; + if (cost < next_worst_cost) { + // There is another bucket that has worse cost than this, + // so stop processing this bucket-- but first put it + // back in the queue. + queue.push(std::pair(cost, cache_index)); + break; + } + } + } + } + KALDI_ASSERT(bytes_freed <= cur_bytes_allocated_); + cur_bytes_allocated_ -= bytes_freed; + tot_time_taken_in_cuda_free_ += tim.Elapsed(); +} + +void CuMemoryAllocator::Free(void *ptr) { + t_++; + unordered_map::iterator iter = + used_map_.find(ptr); + if (iter == used_map_.end()) { + KALDI_ERR << "Attempt to free CUDA memory pointer that was not allocated: " + << ptr; + } + const UsedMemoryElement &elem = iter->second; + size_t num_bytes = elem.row_bytes * elem.num_rows; + + cur_bytes_used_ -= num_bytes; + MruCache &cache = GetCacheForSize(num_bytes); + + cache.Insert(MemoryRequest(elem.row_bytes, elem.num_rows), + CachedMemoryElement(ptr, t_, elem.pitch)); + used_map_.erase(iter); +} + +size_t CuMemoryAllocator::MruCache::LeastRecentTime() const { + if (list_.empty()) { + KALDI_PARANOID_ASSERT(map_.empty()); + return 0; + } else { + const MemoryRequest &mr = list_.front(); + MapType::const_iterator iter = map_.find(mr); + KALDI_ASSERT(iter != map_.end()); + const MapValueType &queue = iter->second; + KALDI_ASSERT(!queue.empty()); + return queue.front().first.t; + } +} + +bool CuMemoryAllocator::MruCache::Lookup(const MemoryRequest &request, + CachedMemoryElement *output) { + MapType::iterator iter = map_.find(request); + if (iter == map_.end()) + return false; + MapValueType &q = iter->second; + KALDI_ASSERT(!q.empty()); + // use q.back() as we want to return the most recently used one if there + // is a choice. We believe this will give better caching behavior. + *output = q.back().first; + list_.erase(q.back().second); + q.pop_back(); + if (q.empty()) + map_.erase(request); + return true; +} + +void CuMemoryAllocator::MruCache::Insert(const MemoryRequest &request, + const CachedMemoryElement &element) { + list_.push_back(request); + map_[request].push_back(std::pair( + element, + --list_.end())); +} + +size_t CuMemoryAllocator::MruCache::RemoveLeastRecentlyUsed() { + // Remove least-recently-used element from cache. + KALDI_ASSERT(!list_.empty()); + MemoryRequest request = list_.front(); + MapType::iterator iter = map_.find(request); + KALDI_ASSERT(iter != map_.end()); + MapValueType &queue = iter->second; + KALDI_ASSERT(!queue.empty()); + // least recently used elements are at the front of the queue. 
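+  // (queue.front() is the oldest entry for this request, so its stored list
+  // iterator must be list_.begin(); the assert below checks this.)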
+ std::pair &p = queue.front(); + KALDI_ASSERT(p.second == list_.begin()); + CU_SAFE_CALL(cudaFree(p.first.pointer)); + queue.pop_front(); + if (queue.empty()) + map_.erase(request); + list_.pop_front(); + return request.first * request.second; +} + +CuMemoryAllocator::MruCache& CuMemoryAllocator::MruCache::operator = ( + const CuMemoryAllocator::MruCache &other) { + KALDI_ASSERT(other.list_.empty()); + return *this; +} +CuMemoryAllocator::MruCache::MruCache( + const CuMemoryAllocator::MruCache &other) { + KALDI_ASSERT(other.list_.empty()); +} + + + + +} + + +#endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h new file mode 100644 index 00000000000..b10601b8245 --- /dev/null +++ b/src/cudamatrix/cu-allocator.h @@ -0,0 +1,229 @@ +// cudamatrix/cu-allocator.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + + +#ifndef KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ +#define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ + +#if HAVE_CUDA == 1 + +#include +#include +#include +#include +#include +#include +#include +#include "base/kaldi-common.h" +#include "util/stl-utils.h" + +namespace kaldi { + + +// For now we don't give the user a way to modify these from the command line. +struct CuAllocatorOptions { + // memory_factor is the total amount of (allocated + cached) memory that we + // allow to be held, relative to the max amount of memory the program has ever + // allocated. It will increase the amount of memory the program will + // potentially consume, by this factor. + BaseFloat memory_factor; + + // This is the minimum amount of memory that we will delete when we are forced + // to delete stuff, relative to the max amount of memory the program has ever + // allocated. This should be less than memory_factor - 1.0 and > 0. It + // shouldn't be too critical. The reason it exists is to avoid calling the + // cleanup code and only releasing very small amounts of memory, because there + // is a constant overhead proportional to the number of buckets. + BaseFloat delete_factor; + + CuAllocatorOptions(): memory_factor(1.5), + delete_factor(0.001) { } + + void Check() { + KALDI_ASSERT(delete_factor < memory_factor - 1.0 && delete_factor > 0.0); + } +}; + + + + +// Class that caches memory for us (the CUDA +// malloc and free routines are very slow). +// This is a member of the CuDevice class. +class CuMemoryAllocator { + public: + void* Malloc(size_t size); + + void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch); + + void Free(void *ptr); + + + // the maximum amount of memory that was ever allocated in the lifetime of the + // program, in bytes. + size_t MaxMemoryAllocated() const { return max_bytes_allocated_; } + + // memory held in the cache currently, in bytes. 
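+  // (i.e. bytes we have allocated from the device that no caller currently
+  // owns: cur_bytes_allocated_ minus cur_bytes_used_.)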
+ size_t MemoryCached() const { return cur_bytes_allocated_ - cur_bytes_used_; } + + // memory that's cached plus memory that's allocated, in bytes. + size_t MemoryAllocated() const { return cur_bytes_allocated_; } + + void PrintMemoryUsage() const; + + CuMemoryAllocator(CuAllocatorOptions opts); + private: + + void FreeSomeCachedMemory(size_t bytes_to_free); + + // This calls CudaMallocPitch, checks for errors (dies if it has to), and + // returns the result. It's up to the caller to do all the bookkeeping though. + inline void* MallocPitchInternal(size_t row_bytes, size_t num_rows, size_t *pitch); + + typedef std::pair MemoryRequest; // (row_bytes, num_rows). + struct CachedMemoryElement { + void *pointer; // the CUDA memory location that we own + size_t t; // time value when we put this in the cache. + size_t pitch; // pitch of this memory region (c.f. cudaMallocPitch()). + CachedMemoryElement() { } + CachedMemoryElement(void *pointer, size_t t, size_t pitch): + pointer(pointer), t(t), pitch(pitch) { } + }; + + // This class caches a map from MemoryRequest to a list of CachedMemoryElements, + // and gives us access to the least-recently-used element for efficient. + // removal. + // We will have an instance of this class for each power-of-2 of size in + // bytes. This makes it easier to, when we need to delete something, find + // the item for which the (time-since-used * size-in-bytes) is approximately + // greatest. + class MruCache { + public: + size_t LeastRecentTime() const; // t value of least recent CachedMemoryElement (0 + // if empty). + + size_t RemoveLeastRecentlyUsed(); // Remove least-recently-used element + // from cache. Return size in bytes of + // that removed memory region. Crash if + // this was empty. + + // Attempts lookup of the most recently cached element corresponding to + // 'request'. If available, removes it from the cache and puts it to + // 'output', and returns true. Otherwise returns false. + bool Lookup(const MemoryRequest &request, + CachedMemoryElement *output); + + // Inserts this CachedMemoryElement to the list of CachedMemoryElements for this + // MemoryRequest. The time in the CachedMemoryElement is expected to be greater + // than times in previously supplied CachedMemoryElements. + void Insert(const MemoryRequest &request, + const CachedMemoryElement &element); + + struct MemoryRequestHasher { + // input is interpreted as (row_bytes, num_rows). row_bytes will always + // be a multiple of 4, and num_rows will frequently be a multiple of + // powers of 2 also. We need to shift right and add so that there will be + // some action in the lower-order bits. + size_t operator () (const std::pair &p) const { + size_t temp = p.first + 1867 * p.second; + return temp + (temp >> 2) + (temp >> 8); + } + }; + + MruCache() { } + // Define these to make inclusion in std::vector possible, but make them + // fail if called on anything but empty cache objects-- we never resize + // the vector of caches after initializing it. + MruCache &operator = (const MruCache &other); + MruCache(const MruCache &other); + private: + typedef std::list ListType; + typedef std::list::iterator ListIterType; + typedef std::deque > MapValueType; + typedef unordered_map MapType; + // 'list_' contains MemoryRequests with the most recent on the back (where they are added), + // and least recent on the front (where they are removed by RemoveLeastRecentlyUsed, although + // they are also removed from random parts of the list by Lookup(). 
+ // There will in general be duplicates of MemoryRequests in the list, as + // many as there are entries in the MapValueType. + ListType list_; + // 'map_' maps from a MemoryRequest to a queue of (memory-element, + // iterator), with the most-recently-added things at the back; we remove + // things from the front of these queues (oldest) inside + // RemoveLeastRecentlyUsed(), and from the back (newest) in Lookup. + MapType map_; + }; + + + inline MruCache &GetCacheForSize(size_t num_bytes); + + CuAllocatorOptions opts_; + + // indexed by log_2 (amount of memory requested), the caches. + std::vector caches_; + + size_t cur_bytes_allocated_; // number of bytes currently owned by callers or + // cached. + size_t max_bytes_allocated_; // the max over all time, of cur_bytes_allocated_. + size_t cur_bytes_used_; // number of bytes currently owned by callers. + size_t max_bytes_used_; // the max over all time, of cur_bytes_used_. + size_t t_; // time counter, incremented with each call. + size_t num_user_allocations_; // number of times user calls Malloc* + size_t num_system_allocations_; // number of times we call cudaMalloc*. + double tot_time_taken_in_cuda_malloc_; // time in cudaMalloc + double tot_time_taken_in_cuda_malloc_pitch_; // time in cudaMallocPitch + double tot_time_taken_in_cuda_free_; // time in cudaFree + double tot_time_taken_in_malloc_pitch_; // time in this->MallocPitch() + + + // a memory element is 'used' when it is currently possessed by the caller + // (and is not in our cache). + struct UsedMemoryElement { + size_t row_bytes; + size_t num_rows; + size_t pitch; + UsedMemoryElement() { } + UsedMemoryElement(size_t row_bytes, size_t num_rows, size_t pitch): + row_bytes(row_bytes), num_rows(num_rows), pitch(pitch) { } + }; + + struct PointerHasher { + size_t operator() (const void *arg) const { + // the last few bits tend to be very predictable, for alignment reasons (CUDA + // allocation may align on 256 byte or 512 byte boundaries or something similar). + size_t temp = reinterpret_cast(arg); + return (temp >> 4) + (temp >> 9); + } + }; + + // This is a map from memory locations owned by the user, so we can recover + // the information when people call Free() and we add it back into the cache. 
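+  // (Keyed by the device pointer handed to the caller; the UsedMemoryElement
+  // stores the row_bytes, num_rows and pitch needed to put the region back
+  // into the right cache when Free() is called.)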
+ unordered_map used_map_; + +}; + + +} // namespace + +#endif // HAVE_CUDA + + +#endif diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index d9e88af36c5..6b9c91be642 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -1,6 +1,6 @@ // cudamatrix/cu-array-inl.h -// Copyright 2009-2012 Karel Vesely +// Copyright 2009-2016 Karel Vesely // 2013 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors @@ -23,6 +23,8 @@ #ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_ #define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_ +#include + #if HAVE_CUDA == 1 #include #include "cudamatrix/cu-common.h" @@ -109,6 +111,23 @@ void CuArray::CopyFromVec(const std::vector &src) { } +template +void CuArray::CopyFromArray(const CuArray &src) { + this->Resize(src.Dim(), kUndefined); + if (dim_ == 0) return; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T), + cudaMemcpyDeviceToDevice)); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + memcpy(this->data_, src.data_, dim_ * sizeof(T)); + } +} + template void CuArray::CopyToVec(std::vector *dst) const { @@ -119,16 +138,33 @@ void CuArray::CopyToVec(std::vector *dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(T), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), dim_ * sizeof(T), cudaMemcpyDeviceToHost)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim.Elapsed()); } else #endif { - memcpy(&dst->front(), data_, dim_*sizeof(T)); + memcpy(&dst->front(), data_, dim_ * sizeof(T)); } } +template +void CuArray::CopyToHost(T *dst) const { + if (dim_ == 0) return; + KALDI_ASSERT(dst != NULL); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + CU_SAFE_CALL(cudaMemcpy(dst, Data(), dim_ * sizeof(T), cudaMemcpyDeviceToHost)); + CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim.Elapsed()); + } else +#endif + { + memcpy(dst, data_, dim_ * sizeof(T)); + } +} + + template void CuArray::SetZero() { if (dim_ == 0) return; @@ -145,70 +181,89 @@ void CuArray::SetZero() { } - -/** - * Print the vector to stream - */ -template -std::ostream &operator << (std::ostream &out, const CuArray &vec) { - std::vector tmp; - vec.CopyToVec(&tmp); - out << "["; - for(int32 i=0; i +void CuArray::Set(const T &value) { + // This is not implemented yet, we'll do so if it's needed. + KALDI_ERR << "CuArray::Set not implemented yet for this type."; } +// int32 specialization implemented in 'cudamatrix/cu-array.cc', +template<> +void CuArray::Set(const int32 &value); template -inline void CuArray::Set(const T &value) { +void CuArray::Add(const T &value) { // This is not implemented yet, we'll do so if it's needed. 
- KALDI_ERR << "CuArray::Set not implemented yet for this type."; + KALDI_ERR << "CuArray::Add not implemented yet for this type."; } - +// int32 specialization implemented in 'cudamatrix/cu-array.cc', template<> -inline void CuArray::Set(const int32 &value) { - if (dim_ == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; +void CuArray::Add(const int32 &value); - dim3 dimBlock(CU2DBLOCK); - dim3 dimGrid(n_blocks(Dim(), CU2DBLOCK)); - ::MatrixDim d = { 1, Dim(), Dim() }; - - cudaI32_set_const(dimGrid, dimBlock, data_, value, d); - CU_SAFE_CALL(cudaGetLastError()); +template +inline T CuArray::Min() const { + KALDI_ASSERT(this->Dim() > 0); + Timer tim; + std::vector tmp(Dim()); + CopyToVec(&tmp); + T ans = *std::min_element(tmp.begin(), tmp.end()); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - for (int32 i = 0; i < dim_; i++) - data_[i] = value; } +#endif + return ans; } -template -void CuArray::CopyFromArray(const CuArray &src) { - this->Resize(src.Dim(), kUndefined); - if (dim_ == 0) return; + +template +inline T CuArray::Max() const { + KALDI_ASSERT(this->Dim() > 0); + Timer tim; + std::vector tmp(Dim()); + CopyToVec(&tmp); + T ans = *std::max_element(tmp.begin(), tmp.end()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T), - cudaMemcpyDeviceToDevice)); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - memcpy(this->data_, src.data_, dim_ * sizeof(T)); } +#endif + return ans; } +template +void CuArray::Read(std::istream& in, bool binary) { + std::vector tmp; + ReadIntegerVector(in, binary, &tmp); + (*this) = tmp; +} + + +template +void CuArray::Write(std::ostream& out, bool binary) const { + std::vector tmp(this->Dim()); + this->CopyToVec(&tmp); + WriteIntegerVector(out, binary, tmp); +} + + +/** + * Print the vector to stream + */ +template +std::ostream &operator << (std::ostream &out, const CuArray &vec) { + std::vector tmp; + vec.CopyToVec(&tmp); + out << "["; + for(int32 i=0; i cu_vec(vec); std::vector vec2; cu_vec.CopyToVec(&vec2); + T *vec22 = new T[vec.size()]; + cu_vec.CopyToHost(vec22); + delete[] vec22; } { // test assignment operator from CuArray. diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc new file mode 100644 index 00000000000..86313f41292 --- /dev/null +++ b/src/cudamatrix/cu-array.cc @@ -0,0 +1,86 @@ +// cudamatrix/cu-array.cc + +// Copyright 2016 Brno University of Technology (author: Karel Vesely) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if HAVE_CUDA == 1 +#include +#endif + +#include "base/timer.h" +#include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-matrixdim.h" +#include "cudamatrix/cu-kernels.h" + +#include "cudamatrix/cu-array.h" + +namespace kaldi { + +template<> +void CuArray::Set(const int32 &value) { + if (dim_ == 0) return; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + + dim3 dimBlock(CU2DBLOCK); + dim3 dimGrid(n_blocks(Dim(), CU2DBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cuda_int32_set_const(dimGrid, dimBlock, data_, value, d); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + for (int32 i = 0; i < dim_; i++) { + data_[i] = value; + } + } +} + + +template<> +void CuArray::Add(const int32 &value) { + if (dim_ == 0) return; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + + dim3 dimBlock(CU2DBLOCK); + dim3 dimGrid(n_blocks(Dim(), CU2DBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cuda_int32_add(dimGrid, dimBlock, data_, value, d); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + for (int32 i = 0; i < dim_; i++) { + data_[i] += value; + } + } +} + + +} // namespace kaldi diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index 18ea7c2ef11..86672db9b08 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -88,6 +88,11 @@ class CuArray { /// objects are more than plain structs. void CopyToVec(std::vector *dst) const; + /// Version of the above function that copies contents to a host array. + /// This function requires *dst to be allocated before calling. The allocated + /// size should be dim_ * sizeof(T) + void CopyToHost(T *dst) const; + /// Sets the memory for the object to zero, via memset. You should verify /// that this makes sense for type T. void SetZero(); @@ -96,6 +101,18 @@ class CuArray { /// assignment operators or destructors are not called. This is NOT IMPLEMENTED /// YET except for T == int32 (the current implementation will just crash). void Set(const T &value); + + /// Add a constant value. This is NOT IMPLEMENTED YET except for T == int32 + /// (the current implementation will just crash). + void Add(const T &value); + + /// Get minimum value (for now implemented on CPU, reimplement if slow). + /// Asserts the vector is non-empty, otherwise crashes. + T Min() const; + + /// Get minimum value (for now implemented on CPU, reimplement if slow). + /// Asserts the vector is non-empty, otherwise crashes. 
+ T Max() const; CuArray &operator= (const CuArray &in) { this->CopyFromArray(in); return *this; @@ -104,6 +121,10 @@ class CuArray { CuArray &operator= (const std::vector &in) { this->CopyFromVec(in); return *this; } + + /// I/O + void Read(std::istream &is, bool binary); + void Write(std::ostream &is, bool binary) const; private: MatrixIndexT dim_; ///< dimension of the vector @@ -115,9 +136,8 @@ class CuArray { /// I/O template std::ostream &operator << (std::ostream &out, const CuArray &vec); - -} // namespace +} // namespace #include "cudamatrix/cu-array-inl.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 018a1a2a672..d36b3e31f92 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -20,7 +20,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index cbe6392dbf6..2b23bf0b621 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -1,6 +1,7 @@ // cudamatrix/cu-common.cc // Copyright 2013 Karel Vesely +// 2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,11 +24,11 @@ // This file contains some #includes, forward declarations // and typedefs that are needed by all the main header // files in this directory. - #include "base/kaldi-common.h" #include "matrix/kaldi-blas.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-matrixdim.h" namespace kaldi { @@ -43,6 +44,29 @@ cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans) { cublas_trans = CUBLAS_OP_C; return cublas_trans; } + +void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, + int32 num_cols, + dim3 *dimGrid, + dim3 *dimBlock) { + KALDI_ASSERT(num_rows > 0 && num_cols > 0); + int32 col_blocksize = 64, row_blocksize = 4; + while (col_blocksize > 1 && + (num_cols + (num_cols / 2) <= col_blocksize || + num_rows > 65536 * row_blocksize)) { + col_blocksize /= 2; + row_blocksize *= 2; + } + + dimBlock->x = col_blocksize; + dimBlock->y = row_blocksize; + dimBlock->z = 1; + dimGrid->x = n_blocks(num_cols, col_blocksize); + dimGrid->y = n_blocks(num_rows, row_blocksize); + KALDI_ASSERT(dimGrid->y <= 65536 && + "Matrix has too many rows to process"); + dimGrid->z = 1; +} #endif } // namespace diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 7530d5c8627..eadf963e2c8 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -30,7 +30,7 @@ #include "matrix/matrix-common.h" #if HAVE_CUDA == 1 -#include +#include #include @@ -41,26 +41,42 @@ if ((ret = (fun)) != 0) { \ KALDI_ERR << "cudaError_t " << ret << " : \"" << cudaGetErrorString((cudaError_t)ret) << "\" returned from '" << #fun << "'"; \ } \ - cudaThreadSynchronize(); \ -} + cudaDeviceSynchronize(); \ +} #define KALDI_CUDA_ERR(ret, msg) \ { \ if (ret != 0) { \ KALDI_ERR << msg << ", diagnostics: cudaError_t " << ret << " : \"" << cudaGetErrorString((cudaError_t)ret) << "\", in " << __FILE__ << ":" << __LINE__; \ } \ - cudaThreadSynchronize(); \ -} + cudaDeviceSynchronize(); \ +} namespace kaldi { /** Number of blocks in which the task of size 'size' is splitted **/ -inline int32 n_blocks(int32 size, int32 block_size) { - return size / block_size + ((size % block_size == 0)? 0 : 1); +inline int32 n_blocks(int32 size, int32 block_size) { + return size / block_size + ((size % block_size == 0)? 
0 : 1); } cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans); - + + +/* + This function gives you suitable dimBlock and dimGrid sizes for a simple + matrix operation (one that applies to each element of the matrix. The x + indexes will be interpreted as column indexes, and the y indexes will be + interpreted as row indexes; this is based on our interpretation of a matrix as + being row-major, i.e. having column-stride = 1, not based on CuBLAS's + opposite interpretation. There is a good reason for associating the column + index with x and not y; this helps memory locality in adjacent kernels. + */ +void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, + int32 num_cols, + dim3 *dimGrid, + dim3 *dimBlock); + + } #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-device-test.cc b/src/cudamatrix/cu-device-test.cc new file mode 100644 index 00000000000..716c1c24d4c --- /dev/null +++ b/src/cudamatrix/cu-device-test.cc @@ -0,0 +1,125 @@ +// cudamatrix/cu-device-test.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" + +using namespace kaldi; + + +namespace kaldi { + + +template +std::string NameOf() { + return (sizeof(Real) == 8 ? 
"" : ""); +} + +template void TestCuMatrixResize(int32 size_multiple) { + int32 num_matrices = 256; + BaseFloat time_in_secs = 0.2; + + std::vector > sizes(num_matrices); + + for (int32 i = 0; i < num_matrices; i++) { + int32 num_rows = RandInt(1, 10); + num_rows *= num_rows; + num_rows *= size_multiple; + int32 num_cols = RandInt(1, 10); + num_cols *= num_cols; + num_cols *= size_multiple; + sizes[i].first = num_rows; + sizes[i].second = num_rows; + } + + std::vector > matrices(num_matrices); + + Timer tim; + size_t num_floats_processed = 0; + for (;tim.Elapsed() < time_in_secs; ) { + int32 matrix = RandInt(0, num_matrices - 1); + if (matrices[matrix].NumRows() == 0) { + int32 num_rows = sizes[matrix].first, + num_cols = sizes[matrix].second; + matrices[matrix].Resize(num_rows, num_cols, kUndefined); + num_floats_processed += num_rows * num_cols; + } else { + matrices[matrix].Resize(0, 0); + } + } + + BaseFloat gflops = num_floats_processed / (tim.Elapsed() * 1.0e+09); + + KALDI_LOG << "For CuMatrix::Resize" << NameOf() << ", for size_multiple = " + << size_multiple << ", speed was " << gflops << " gigaflops."; +} + +template +void CudaMatrixResizeTest() { + std::vector sizes; + sizes.push_back(1); + sizes.push_back(2); + sizes.push_back(4); + sizes.push_back(8); + sizes.push_back(16); + //sizes.push_back(24); + //sizes.push_back(32); + //sizes.push_back(40); + + int32 ns = sizes.size(); + for (int32 s = 0; s < ns; s++) + TestCuMatrixResize(sizes[s]); +} + + +} // namespace kaldi + + +int main() { + for (int32 loop = 0; loop < 2; loop++) { +#if HAVE_CUDA == 1 + if (loop == 0) + CuDevice::Instantiate().SelectGpuId("no"); + else + CuDevice::Instantiate().SelectGpuId("yes"); +#endif + + kaldi::CudaMatrixResizeTest(); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().DoublePrecisionSupported()) { + kaldi::CudaMatrixResizeTest(); + } else { + KALDI_WARN << "Double precision not supported"; + } +#else + kaldi::CudaMatrixResizeTest(); +#endif + } +#if HAVE_CUDA == 1 + CuDevice::Instantiate().PrintProfile(); +#endif + std::cout << "Tests succeeded.\n"; +} diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 5246dfd2cb7..c34994ed6ce 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -2,7 +2,7 @@ // Copyright 2009-2012 Karel Vesely // 2013 Lucas Ondel -// 2013 Johns Hopkins University (author: Daniel Povey) +// 2013-2015 Johns Hopkins University (author: Daniel Povey) // 2015 Guoguo Chen // See ../../COPYING for clarification regarding multiple authors @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 -#include +#include #include #include @@ -41,10 +41,10 @@ #include "base/kaldi-error.h" #include "base/kaldi-utils.h" #include "util/common-utils.h" +#include "util/kaldi-io.h" namespace kaldi { - /** This function was added by Dan in July 2015 after upgrading on the CLSP cluster to the CUDA 7.0 toolkit; the old mechanism of just calling @@ -55,22 +55,26 @@ namespace kaldi { changed feature (the NVidia docs were never super-clear regarding device initialization). But regardless, changing to this new mechanism should be harmless even if the problem was specific to the CLSP grid. - */ +*/ -static bool GetCudaContext(int32 num_gpus) { - cudaError_t e; +static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { + std::ostringstream debug_stream; + debug_stream << "num-gpus=" << num_gpus << ". "; for (int32 device = 0; device < num_gpus; device++) { cudaSetDevice(device); - e = cudaDeviceSynchronize(); // << CUDA context gets created here. 
- cudaGetLastError(); // reset the error state + cudaError_t e = cudaDeviceSynchronize(); // << CUDA context gets created here. if (e == cudaSuccess) { + *debug_str = debug_stream.str(); return true; } + debug_stream << "Device " << device << ": " << cudaGetErrorString(e) << ". "; + cudaGetLastError(); // Make sure the error state doesn't get returned in + // the next cudaGetLastError(). } + *debug_str = debug_stream.str(); return false; } - /** * SelectGpuId(use_gpu) * @@ -102,7 +106,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { << ", cannot change it on the fly!"; } // Allow the GPU to stay disabled - if(!Enabled() && use_gpu == "no") { + if (!Enabled() && use_gpu == "no") { KALDI_LOG << "Manually selected to compute on CPU."; return; } @@ -110,8 +114,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { // Check that we have a gpu available int32 num_gpus = 0; - cudaError_t e; - e = cudaGetDeviceCount(&num_gpus); + cudaError_t e = cudaGetDeviceCount(&num_gpus); if (num_gpus == 0) { if (use_gpu == "yes" || use_gpu == "wait") { @@ -124,18 +127,24 @@ void CuDevice::SelectGpuId(std::string use_gpu) { } // Create a CUDA context. - bool got_context = GetCudaContext(num_gpus); + std::string debug_str; + bool got_context = GetCudaContext(num_gpus, &debug_str); if (use_gpu != "wait") { if (!got_context) { // So far no we don't have context, sleep a bit and retry. int32 sec_sleep = (use_gpu == "yes" ? 20 : 2); KALDI_WARN << "Will try again to get a GPU after " << sec_sleep - << " seconds."; + << " seconds."; Sleep(sec_sleep); - if (! GetCudaContext(num_gpus)) { + if (!GetCudaContext(num_gpus, &debug_str)) { if (use_gpu == "yes") { - KALDI_CUDA_ERR(e, "Failed to create CUDA context, no more unused GPUs?"); + { + Input input; + input.Open("nvidia-smi 1>&2 |"); + } + KALDI_LOG << debug_str; + KALDI_ERR << "Failed to create CUDA context, no more unused GPUs? "; } if (use_gpu == "optional") { KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?"; @@ -154,7 +163,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { num_times++; wait_time += sec_sleep; Sleep(sec_sleep); - got_context = GetCudaContext(num_gpus); + got_context = GetCudaContext(num_gpus, &debug_str); } KALDI_WARN << "Waited " << wait_time @@ -170,7 +179,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Or suggest to use compute exclusive mode - if(num_gpus > 1) { + if (num_gpus > 1) { KALDI_WARN << "Suggestion: use 'nvidia-smi -c 1' to set compute exclusive mode"; } // And select the GPU according to proportion of free memory @@ -199,15 +208,14 @@ void CuDevice::FinalizeActiveGpu() { // Get the device-id of active device: { int32 act_gpu_id; - cudaError_t e; - e = cudaGetDevice(&act_gpu_id); - if(e != cudaSuccess) { + cudaError_t e = cudaGetDevice(&act_gpu_id); + if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get device-id of active device."); } // Remember the id of active GPU active_gpu_id_ = act_gpu_id; // CuDevice::Enabled() is true from now on // Initialize the CUBLAS - CU_SAFE_CALL(cublasInit()); + CU_SAFE_CALL(cublasCreate(&handle_)); // Notify user which GPU is finally used char name[128]; @@ -218,8 +226,6 @@ void CuDevice::FinalizeActiveGpu() { KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t" << GetFreeMemory(&free_memory_at_startup_, NULL) << " version " << properties_.major << "." 
<< properties_.minor; - - if (verbose_) PrintMemoryUsage(); } return; } @@ -239,12 +245,12 @@ bool CuDevice::IsComputeExclusive() { // get the device-id and its device-properties int32 gpu_id = -1; cudaError_t e = cudaGetDevice(&gpu_id); - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get current device"); } struct cudaDeviceProp gpu_prop; e = cudaGetDeviceProperties(&gpu_prop, gpu_id); - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get device properties"); } // find out whether compute exclusive mode is used @@ -263,7 +269,7 @@ bool CuDevice::IsComputeExclusive() { // The computation mode is not compute-exclusive, // in this case we release the GPU context... e = cudaThreadExit(); // deprecated, but for legacy reason not cudaDeviceReset - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to release CUDA context on a GPU"); } return false; @@ -277,14 +283,13 @@ bool greater_pair(const std::pair &left, const std::pair& right) bool CuDevice::SelectGpuIdAuto() { // Check that we have at least one gpu - cudaError_t e; int32 num_gpus = 0; - e = cudaGetDeviceCount(&num_gpus); - if(num_gpus == 0) { + cudaError_t e = cudaGetDeviceCount(&num_gpus); + if (num_gpus == 0) { KALDI_WARN << "No CUDA devices found"; if (e != cudaSuccess) { KALDI_WARN << "cudaGetDeviceCount() returned " << e - <<", meaning: \"" << cudaGetErrorString(e) << "\""; + <<", meaning: \"" << cudaGetErrorString(e) << "\""; } return false; } @@ -343,7 +348,7 @@ bool CuDevice::SelectGpuIdAuto() { // find GPU with max free memory int32 max_id=0; std::sort(free_mem_ratio.begin(), free_mem_ratio.end(), - greater_pair); + greater_pair); // the free_mem_ratio should be bigger than zero KALDI_ASSERT(free_mem_ratio[max_id].second > 0.0); @@ -359,14 +364,14 @@ bool CuDevice::SelectGpuIdAuto() { KALDI_LOG << "Trying to select device: " << dev_id << " (automatically), mem_ratio: " << mem_ratio; e = cudaSetDevice(dev_id); - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_WARN << "Cannot select this device: return code " << e - << ", Error message: \"" << cudaGetErrorString(e) << "\""; + << ", Error message: \"" << cudaGetErrorString(e) << "\""; } else { e = cudaThreadSynchronize(); // deprecated, but for legacy not cudaDeviceSynchronize - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_WARN << "Cannot select this device: return code " << e - << ", Error message: \"" << cudaGetErrorString(e) << "\""; + << ", Error message: \"" << cudaGetErrorString(e) << "\""; } } max_id++; @@ -390,9 +395,11 @@ void CuDevice::AccuProfile(const std::string &key, double time) { void CuDevice::PrintMemoryUsage() const { if (Enabled()) { + allocator_.PrintMemoryUsage(); int64 free_memory_now; GetFreeMemory(&free_memory_now, NULL); - KALDI_LOG << "Memory used: " << (free_memory_at_startup_ - free_memory_now) << " bytes."; + KALDI_LOG << "Memory used (according to the device): " + << (free_memory_at_startup_ - free_memory_now) << " bytes."; } } @@ -400,7 +407,7 @@ void CuDevice::PrintProfile() { if (verbose_ && Enabled()) { std::ostringstream os; os << "-----\n[cudevice profile]\n"; - std::map::iterator it; + unordered_map::iterator it; std::vector > pairs; double total_time = 0.0; for(it = profile_map_.begin(); it != profile_map_.end(); ++it) { @@ -425,10 +432,10 @@ void CuDevice::PrintProfile() { std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { -// WARNING! the CUDA API is inconsistent accross versions! + // WARNING! 
the CUDA API is inconsistent accross versions! #ifdef _MSC_VER - size_t mem_free, mem_total; - cuMemGetInfo_v2(&mem_free, &mem_total); + size_t mem_free, mem_total; + cuMemGetInfo_v2(&mem_free, &mem_total); #else #if (CUDA_VERSION >= 3020) // define the function signature type @@ -437,16 +444,12 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { unsigned int mem_free, mem_total; #endif { - // we will load the cuMemGetInfo dynamically from libcuda.so - // cuMemGetInfo(&mem_free, &mem_total); + // we will load cuMemGetInfo_v2 dynamically from libcuda.so // pre-fill ``safe'' values that will not cause problems mem_free = 1; mem_total = 1; -#ifdef _MSC_VER - cuMemGetInfo_v2(&mem_free, &mem_total); -#else // open libcuda.so void* libcuda = dlopen("libcuda.so",RTLD_LAZY); - if(NULL == libcuda) { + if (NULL == libcuda) { KALDI_WARN << "cannot open libcuda.so"; } else { // define the function signature type @@ -458,7 +461,7 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { typedef CUresult (*cu_fun_ptr)(int*, int*); cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); #endif - if(NULL == dl_cuMemGetInfo) { + if (NULL == dl_cuMemGetInfo) { KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so"; } else { // call the function @@ -467,12 +470,11 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { // close the library dlclose(libcuda); } -#endif } #endif // copy the output values outside - if(NULL != free) *free = mem_free; - if(NULL != total) *total = mem_total; + if (NULL != free) *free = mem_free; + if (NULL != total) *total = mem_total; // prepare the text output std::ostringstream os; os << "free:" << mem_free/(1024*1024) << "M, " @@ -491,14 +493,14 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) { #else // open libcuda.so void* libcuda = dlopen("libcuda.so",RTLD_LAZY); - if(NULL == libcuda) { + if (NULL == libcuda) { KALDI_WARN << "cannot open libcuda.so"; } else { // define the function signature type typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice); // get the symbol cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); - if(NULL == cuDeviceGetName_ptr) { + if (NULL == cuDeviceGetName_ptr) { KALDI_WARN << "cannot load cuDeviceGetName from libcuda.so"; } else { // call the function @@ -512,7 +514,7 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) { void CuDevice::CheckGpuHealth() { - if(!Enabled()) return; + if (!Enabled()) return; Timer t; // prepare small matrices for a quick test Matrix a(50, 100); @@ -532,47 +534,48 @@ void CuDevice::CheckGpuHealth() { } -void CuDevice::Free(void *ptr) { +/* + void CuDevice::Free(void *ptr) { CU_SAFE_CALL(cudaFree(ptr)); -} + } -void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { - void *ret_ptr = NULL; - cudaError_t e = cudaMallocPitch(&ret_ptr, pitch, row_bytes, num_rows); + void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { + void *ans = NULL; + cudaError_t e = cudaMallocPitch(&ans, pitch, row_bytes, num_rows); if (e != cudaSuccess) { - PrintMemoryUsage(); - KALDI_ERR << "CuDevice::MallocPitch: cannot allocate the requested memory (" - << row_bytes << " x " << num_rows << " = " - << row_bytes * num_rows << " bytes )"; + PrintMemoryUsage(); + KALDI_ERR << "CuDevice::MallocPitch: cannot allocate the requested memory (" + << row_bytes << " x " << num_rows << " = " + << row_bytes * num_rows << " bytes )"; + } + return ans; } - return ret_ptr; -} -void* 
CuDevice::Malloc(size_t size) { - void *ret_ptr = NULL; - cudaError_t e = cudaMalloc(&ret_ptr, size); + void* CuDevice::Malloc(size_t size) { + void *ans = NULL; + cudaError_t e = cudaMalloc(&ans, size); if (e != cudaSuccess) { - PrintMemoryUsage(); - KALDI_ERR << "CuDevice::Malloc: cannot allocate the requested memory" - << " (" << size << " bytes )"; + PrintMemoryUsage(); + KALDI_ERR << "CuDevice::Malloc: cannot allocate the requested memory" + << " (" << size << " bytes )"; } - return ret_ptr; -} + return ans; + } +*/ -CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true) - { } +CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true), + allocator_(CuAllocatorOptions()) { } CuDevice::~CuDevice() { if (Enabled()) { - cublasShutdown(); + cublasDestroy(handle_); + cudaDeviceReset(); } } // The instance of the static singleton CuDevice CuDevice::global_device_; - - } diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 5858fc2d84e..ddf275a73e8 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -1,6 +1,7 @@ // cudamatrix/cu-device.h // Copyright 2009-2012 Karel Vesely +// 2012-2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -24,18 +25,19 @@ #if HAVE_CUDA == 1 +#include #include #include #include #include #include #include "base/kaldi-common.h" +#include "cudamatrix/cu-allocator.h" namespace kaldi { - /** - * Singleton object which represents CUDA device + * Singleton object which represents the CUDA device * responsible for CUBLAS initilalisation, collects profiling info */ class CuDevice { @@ -44,26 +46,29 @@ class CuDevice { ~CuDevice(); static inline CuDevice& Instantiate() { return global_device_; } + inline cublasHandle_t GetHandle() { return handle_; } + // We provide functions Malloc, MallocPitch and Free which replace cudaMalloc, // cudaMallocPitch and cudaFree. Their function is to cache the results of // previous allocations to avoid the very large overhead that CUDA's // allocation seems to give for some setups. - void* Malloc(size_t size); - - void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch); - - void Free(void *ptr); + inline void* Malloc(size_t size) { return allocator_.Malloc(size); } + + inline void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { + return allocator_.MallocPitch(row_bytes, num_rows, pitch); + } + inline void Free(void *ptr) { allocator_.Free(ptr); } /// Select a GPU for computation, the 'use_gpu' modes are: /// "yes" -- Select GPU automatically and die if this fails. - /// "optional" -- Do as above, but if it fails, back off to CPU. - /// "no" -- Run on CPU. + /// "optional" -- Do as above, but if it fails, back off to CPU. + /// "no" -- Run on CPU. 
/// (more comments in cu-device.cc) void SelectGpuId(std::string use_gpu); /// Check if the CUDA GPU is selected for use bool Enabled() const { - return (active_gpu_id_ > -1); + return (active_gpu_id_ > -1); } /// Get the active GPU id @@ -79,18 +84,18 @@ class CuDevice { /// Sum the IO time void AccuProfile(const std::string &key, double time); - void PrintProfile(); + void PrintProfile(); void PrintMemoryUsage() const; - - void ResetProfile() { - profile_map_.clear(); + + void ResetProfile() { + profile_map_.clear(); } - + /// Get the actual GPU memory use stats std::string GetFreeMemory(int64* free = NULL, int64* total = NULL) const; /// Get the name of the GPU - void DeviceGetName(char* name, int32 len, int32 dev); + void DeviceGetName(char* name, int32 len, int32 dev); /// Check if GPU is in good condition by multiplying small matrices on GPU+CPU. /// Overheated GPUs may give inaccurate results, which we want to detect. @@ -100,14 +105,16 @@ class CuDevice { /// will always be a multiple of n (from properties_.textureAlignment). /// Otherwise, return 16, which is the stride used for CPU matrices. int32 GetMatrixAlignment() const; - + private: CuDevice(); CuDevice(CuDevice&); // Disallow. CuDevice &operator=(CuDevice&); // Disallow. + static CuDevice global_device_; - + cublasHandle_t handle_; + /// Check if the GPU run in compute exclusive mode Returns true if it is /// running in compute exclusive mode and we have a GPU. Returns false /// otherwise. Sets error to true if there was some error, such as that we @@ -122,31 +129,35 @@ class CuDevice { bool SelectGpuIdManual(int32 gpu_id); void FinalizeActiveGpu(); - - /// Should only be called if Enabled() == true. + + /// Should only be called if Enabled() == true. int32 MajorDeviceVersion(); - /// Should only be called if Enabled() == true. + /// Should only be called if Enabled() == true. int32 MinorDeviceVersion(); - std::map profile_map_; - + unordered_map profile_map_; + /// active_gpu_id_ values: /// -3 default (default, the SelectGpuId was not called, we did not want to use GPU) /// -2 SelectGpuId was called, but no GPU was present /// -1 SelectGpuId was called, but the GPU was manually disabled /// 0..N Normal GPU IDs - int32 active_gpu_id_; - + int32 active_gpu_id_; + int64 free_memory_at_startup_; - + cudaDeviceProp properties_; bool verbose_; - + CuMemoryAllocator allocator_; + }; // class CuDevice +// This function is declared as a more convenient way to get the CUDA device handle for use +// in the CUBLAS v2 API, since we so frequently need to access it. 
+inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetHandle(); } } // namespace diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index caae069da9e..bb909b47c32 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -2,7 +2,7 @@ // Copyright 2009-2012 Karel Vesely // 2013 Johns Hopkins University (author: Daniel Povey) -// 2013 Hainan Xu +// 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen @@ -35,7 +35,8 @@ extern "C" { /********************************************************* * int32 CUDA kernel calls (no template wrapper) */ -void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, MatrixDim d); +void cuda_int32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, MatrixDim d); +void cuda_int32_add(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, MatrixDim d); @@ -44,7 +45,7 @@ void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, Matr */ /* - * CuMatrix + * CuMatrix */ void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA); void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA); @@ -55,11 +56,10 @@ void cudaF_copy_from_tp_trans(dim3 Gr, dim3 Bl, float* A, const float* B, Matrix void cudaFD_copy_from_tp_trans(dim3 Gr, dim3 Bl, float* A, const double* B, MatrixDim dmat); void cudaF_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const float* B, MatrixDim dmat); void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B, MatrixDim dmat); -void cudaF_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d); void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d); void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim d); -void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d); void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -90,6 +90,7 @@ void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const f void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size); void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d); void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans); +void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float *dst, MatrixDim d, int src_stride, int A_trans); void cudaF_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c); void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d); void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d); @@ -106,19 +107,19 @@ void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim); void cudaF_vec_soft_max(int 
Gr, int Bl, float* v, int dim); void cudaF_vec_min(const float* v, float* value, int dim); void cudaF_vec_max(const float* v, float* value, int dim); -void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value); -void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value); -void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, - int N_col_stride, int threads_per_element, float beta); +void cudaF_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value); +void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value); +void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, + int N_col_stride, int threads_per_element, float beta); void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim); -void cudaF_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim); void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim); void cudaF_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim); void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); void cudaF_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size); void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim); void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim); +void cudaF_vec_apply_ceiling(int Gr, int Bl, float* v, float ceiling_val, float* num, int dim); void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim); void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim); void cudaF_trace(int Gr, int Bl, float* mat, float* value, int dim); @@ -141,12 +142,13 @@ void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, i void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); void cudaF_group_max(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size); void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); +void cudaF_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d, int stride_grad); -void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d); +void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, MatrixDim d); void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d); void 
cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in); @@ -158,17 +160,18 @@ void cudaF_copy_from_sp(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_ void cudaF_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in); void cudaF_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in); void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in); -void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int s); +void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int num_elements); +void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, float alpha, const Int32Pair* indices, const float* x, int s, float* data); void cudaF_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t); -void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim, float *S, MatrixDim sdim); void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, - const Int32Pair *indices); + const Int32Pair *indices); void cudaF_add_row_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, - const Int32Pair *indexes); + const Int32Pair *indexes); void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim, const Int32Pair *indices, int indices_size, float *output); @@ -176,28 +179,27 @@ void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim, void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride); - + /********************************************************* * double CUDA kernel calls */ /* - * CuMatrix + * CuMatrix */ void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB); void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA); void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *vec, const double *mat2, int mat2_row_stride, - int mat2_col_stride, double beta); + int mat2_col_stride, double beta); void cudaD_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat); void cudaDF_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat); void cudaD_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat); void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat); -void cudaD_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d); void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d); void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim d); -void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); +void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d); void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); 
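// (Naming convention used throughout this header: each kernel is exposed
//  twice through the extern "C" interface, cudaF_* for float and cudaD_* for
//  double.  A thin templated C++ layer then picks the right one by
//  overloading, roughly as in the sketch below; this is illustrative only,
//  the real overloads live in cu-kernels.h and may differ in detail:
//
//    inline void cuda_add_cols(dim3 Gr, dim3 Bl, float *dst, const float *src,
//                              const MatrixIndexT_cuda *reorder,
//                              MatrixDim dst_dim, int src_stride) {
//      cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
//    }
//    inline void cuda_add_cols(dim3 Gr, dim3 Bl, double *dst, const double *src,
//                              const MatrixIndexT_cuda *reorder,
//                              MatrixDim dst_dim, int src_stride) {
//      cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
//    }
//
//  so templated CuMatrix<Real> code can call cuda_add_cols(...) for either
//  precision without touching these C declarations directly.)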
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -228,6 +230,7 @@ void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size); void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d); void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans); +void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double *dst, MatrixDim d, int src_stride, int A_trans); void cudaD_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, int stride_c); void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d); void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d); @@ -245,19 +248,19 @@ void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim); void cudaD_vec_min(const double* v, double* value, int dim); void cudaD_vec_max(const double* v, double* value, int dim); -void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value); -void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value); -void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, - int N_col_stride, int threads_per_element, double beta); +void cudaD_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value); +void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value); +void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, + int N_col_stride, int threads_per_element, double beta); void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim); -void cudaD_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim); void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim); void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim); void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); void cudaD_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size); void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim); void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim); +void cudaD_vec_apply_ceiling(int Gr, int Bl, double* v, double ceiling_val, float* num, int dim); void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim); void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim); void cudaD_trace(int Gr, int Bl, double* mat, double* value, int dim); @@ 
-270,7 +273,7 @@ void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const d void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks, const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride, const double *D_data, int D_row_stride, int D_col_stride, - double alpha, double beta); + double alpha, double beta); /* @@ -282,12 +285,13 @@ void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); void cudaD_group_max(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size); void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); +void cudaD_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d, int stride_grad); -void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d); +void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, MatrixDim d); void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d); void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in); @@ -320,17 +324,13 @@ void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl, float* mat_out, const Matrix void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); -void cuda_copy_from_smat_as_vec_ff(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in); -void cuda_copy_from_smat_as_vec_fd(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in); -void cuda_copy_from_smat_as_vec_df(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in); -void cuda_copy_from_smat_as_vec_dd(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in); - void cudaF_trace_mat_smat(dim3 Gr, dim3 Bl, const float* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, float* trace_vec_out); void cudaF_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const float* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, float* trace_vec_out); void cudaD_trace_mat_smat(dim3 Gr, dim3 Bl, const double* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, double* trace_vec_out); void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, double* trace_vec_out); -void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, 
MatrixElement* x, int s); +void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int num_elements); +void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, double alpha, const Int32Pair* indices, const double* x, int s, double* data); void cudaD_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement* x, int s, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t); void cudaD_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); @@ -345,14 +345,14 @@ void cudaD_add_row_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim, const Int32Pair *indices, int indices_size, double *output); - + void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, - const double *mat2, double *mask, MatrixDim mat1_dim, + const double *mat2, double *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride); - - -} // extern "C" + + +} // extern "C" #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 00d6c71ab2d..c2d8b45174a 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -25,7 +25,7 @@ // In this file is the CUDA code of the CUDA kernels, plus the ANSI-C wrappers #include -#include "cu-kernels-ansi.h" +#include "cudamatrix/cu-kernels-ansi.h" /*********************************************************************** @@ -35,7 +35,7 @@ template __device__ static Real _sum_reduce(Real buffer[]) { // Total number of active threads - int32_cuda nTotalThreads = blockDim.x; + int32_cuda nTotalThreads = blockDim.x; __syncthreads(); // perform tree-based reduction (sum) while(nTotalThreads > 1) { @@ -70,7 +70,7 @@ static Real _min_reduce(Real buffer[]) { if (threadIdx.x < halfPoint) { if (threadIdx.x + halfPoint < nTotalThreads) { Real temp = buffer[threadIdx.x + halfPoint]; - if (temp < buffer[threadIdx.x]) + if (temp < buffer[threadIdx.x]) buffer[threadIdx.x] = temp; } } @@ -86,7 +86,7 @@ template __device__ static Real _max_reduce(Real buffer[]) { // Total number of active threads - int32_cuda nTotalThreads = blockDim.x; + int32_cuda nTotalThreads = blockDim.x; __syncthreads(); // perform tree-based reduction (max) while(nTotalThreads > 1) { @@ -96,7 +96,7 @@ static Real _max_reduce(Real buffer[]) { // Get the shared value stored by another thread if(threadIdx.x+halfPoint < nTotalThreads) { Real temp = buffer[threadIdx.x + halfPoint]; - if (temp > buffer[threadIdx.x]) + if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp; } } @@ -113,7 +113,7 @@ template __device__ static int32_cuda _max_id_reduce(Real val[], int32_cuda idx[]) { // Total number of active threads - int32_cuda nTotalThreads = blockDim.x; + int32_cuda nTotalThreads = blockDim.x; __syncthreads(); // perform tree-based reduction (get index of maximum) while(nTotalThreads > 1) { @@ -175,17 +175,14 @@ __global__ static void _add_diag_vec_mat(Real alpha, Real *mat, MatrixDim mat_dim, const Real *vec, const Real *mat2, int mat2_row_stride, int mat2_col_stride, Real beta) { - // Note from Dan: in this kernel, we make the x dimension correspond to the - // row index and y to the column index. That was not always the case for - // earlier kernels written by others. 
- int i = blockIdx.y * blockDim.y + threadIdx.y; // row index - int j = blockIdx.x * blockDim.x + threadIdx.x; // column index - - int index = i * mat_dim.stride + j, - index2 = i * mat2_row_stride + j * mat2_col_stride; - - if (i < mat_dim.rows && j < mat_dim.cols) { - mat[index] = alpha * vec[i] * mat2[index2] + beta * mat[index]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // column index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + + int index = j * mat_dim.stride + i, + index2 = j * mat2_row_stride + i * mat2_col_stride; + + if (i < mat_dim.cols && j < mat_dim.rows) { + mat[index] = alpha * vec[j] * mat2[index2] + beta * mat[index]; } } @@ -193,13 +190,12 @@ static void _add_diag_vec_mat(Real alpha, Real *mat, MatrixDim mat_dim, template __global__ static void _copy_from_tp(Real* A, const OtherReal* B, MatrixDim dmat) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - - if (i < dmat.rows && j < dmat.cols) { - int32_cuda index_B = (i * (i+1) / 2) + j; - int32_cuda index_A = i * dmat.stride + j; - if (j <= i) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dmat.cols && j < dmat.rows) { + int32_cuda index_B = (j * (j+1) / 2) + i; + int32_cuda index_A = j * dmat.stride + i; + if (i <= j) { A[index_A] = B[index_B]; } else { A[index_A] = 0.0; @@ -211,6 +207,8 @@ static void _copy_from_tp(Real* A, const OtherReal* B, MatrixDim dmat) { template __global__ static void _copy_from_tp_trans(Real* A, const OtherReal* B, MatrixDim dmat) { + // we interpret these indexes oppositely from normal, but it doesn't + // matter as it's invoked in a symmetric way. int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // transpose the indices used to index the source TpMatrix. @@ -226,32 +224,54 @@ static void _copy_from_tp_trans(Real* A, const OtherReal* B, MatrixDim dmat) { } -// for this kernel, following the newer pattern, the x-dim is the row-index, the -// y-dim is the col-index. template __global__ static void _copy_from_mat(Real* mat_out, const OtherReal* mat_in, MatrixDim d_out, MatrixDim d_in) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // row-index - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // col-index. - int32_cuda index_out = j + i * d_out.stride; - int32_cuda index_in = j + i * d_in.stride; - if (i < d_out.rows && j < d_out.cols) + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // col-index + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row-index. 
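// (The point of mapping threadIdx.x to the column index in these rewritten
//  kernels is memory coalescing: consecutive threads of a warp then touch
//  consecutive elements of one row of the row-major matrix, so the global
//  loads and stores coalesce.  The transposed copy that follows goes further
//  and stages TileDim x TileDim blocks in shared memory so that both the
//  read from mat_in and the write to mat_out stay coalesced.)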
+ int32_cuda index_out = i + j * d_out.stride; + int32_cuda index_in = i + j * d_in.stride; + if (i < d_out.cols && j < d_out.rows) mat_out[index_out] = static_cast(mat_in[index_in]); } +template +__global__ +static void _copy_from_mat_trans(Real* mat_out, const OtherReal* mat_in, + MatrixDim d_out, MatrixDim d_in) { + // Use shared meme to achieve both coalesced memory reading and writing + // '+1' to avoid bank conflict when reading sbuf + __shared__ Real sbuf[TileDim][TileDim + 1]; + const int32_cuda i_in = blockIdx.y * TileDim + threadIdx.y; // row-index + const int32_cuda j_in = blockIdx.x * TileDim + threadIdx.x; // col-index + const int32_cuda tile_stride_in = CU1DBLOCK / TileDim * d_in.stride; + int32_cuda index_in = i_in * d_in.stride + j_in; -// for this kernel, the x-dim is the row-index at the output, the y-dim is the -// col-index at the output -template -__global__ -static void _copy_from_mat_trans(Real* mat_out, const OtherReal* mat_in, MatrixDim d_out, MatrixDim d_in) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // row-index out - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // col-index out - int32_cuda index_out = j + i * d_out.stride; - int32_cuda index_in = i + j * d_in.stride; - if (i < d_out.rows && j < d_out.cols) - mat_out[index_out] = static_cast(mat_in[index_in]); +# pragma unroll + for (int i = 0; i < TileDim; i += CU1DBLOCK / TileDim) { + if (i_in + i < d_in.rows && j_in < d_in.cols) { + sbuf[threadIdx.y + i][threadIdx.x] = static_cast(mat_in[index_in]); + } + index_in += tile_stride_in; + } + __syncthreads(); + + // Grid is transposed, but block is not yet. + // Warp (blockDim.x) is always along the row-dim. + const int32_cuda i_out = blockIdx.x * TileDim + threadIdx.y; + const int32_cuda j_out = blockIdx.y * TileDim + threadIdx.x; + const int32_cuda tile_stride_out = CU1DBLOCK / TileDim * d_out.stride; + int32_cuda index_out = i_out * d_out.stride + j_out; + +# pragma unroll + for (int i = 0; i < TileDim; i += CU1DBLOCK / TileDim) { + if (i_out + i < d_out.rows && j_out < d_out.cols) { + // block is tranposed when reading sbuf + mat_out[index_out] = sbuf[threadIdx.x][threadIdx.y + i]; + } + index_out += tile_stride_out; + } } template @@ -272,14 +292,6 @@ static void _copy_from_smat_trans(Real* mat_out, const MatrixElement* mat_out[data_index] = smat_in[smat_index].weight; } -template -__global__ -static void _copy_from_smat_as_vec(Real* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - int smat_index = blockIdx.x * blockDim.x + threadIdx.x; - if (smat_index >= d_in) return; - vec_out[smat_index] = smat_in[smat_index].weight; -} - template __global__ static void _trace_mat_smat_trans(const Real* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, Real* trace_vec_out) { @@ -312,25 +324,13 @@ static void _transpose_matrix(Real* mat, MatrixDim d) { mat[index_b] = a; } - -template -__global__ -static void _copy_col_from_vec(Real* mat, const Real* v, int col, MatrixDim d) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i < d.rows ) { - int32_cuda index = col + i * d.stride; - mat[index] = v[i]; - } -} - - template __global__ static void _apply_exp(Real* mat, MatrixDim d) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda index = i + j * d.stride; - if ( i < d.cols && j < d.rows ) { + if (i < d.cols && j < d.rows) { mat[index] = exp(mat[index]); } } @@ -380,8 +380,8 @@ static void _add_diag_packed(Real* mat, 
Real value, int dim) { template __global__ static void _set_const(Real* mat, Real value, MatrixDim d) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // column + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row int32_cuda index = i + j * d.stride; if (i < d.cols && j < d.rows) mat[index] = value; @@ -495,14 +495,14 @@ static void _mul_rows_vec(Real* mat, const Real* scale, MatrixDim d) { template __global__ -static void _mul_rows_group_mat(Real *y, const Real *x, MatrixDim d, +static void _mul_rows_group_mat(Real *y, const Real *x, MatrixDim d, int src_stride, int group_size) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; - if (j < d.rows && i < d.cols ) { + if (j < d.rows && i < d.cols ) { int dst_index = i + j * d.stride; int src_index = i / group_size + j * src_stride; - y[dst_index] *= x[src_index]; + y[dst_index] *= x[src_index]; } } @@ -514,7 +514,7 @@ static void _calc_pnorm_deriv(Real *deriv, const Real *vec, const Real *norm, MatrixDim d, int src_stride, int group_size, Real power) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; - if (j < d.rows && i < d.cols ) { + if (j < d.rows && i < d.cols ) { int dst_index = i + j * d.stride, src_index = i / group_size + j * src_stride; Real vec_element = vec[dst_index], // this is the element of the original vector. @@ -571,7 +571,7 @@ static void _div_rows_vec(Real* mat, const Real* vec_div, MatrixDim d) { inv[threadIdx.y] = 1.0/vec_div[j]; } __syncthreads(); - + //multiply elements if (i < d.cols && j < d.rows) mat[index] *= inv[threadIdx.y]; @@ -581,12 +581,12 @@ static void _div_rows_vec(Real* mat, const Real* vec_div, MatrixDim d) { template __global__ static void _add_mat(Real alpha, const Real* src, Real* dst, MatrixDim d, int src_stride) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda index = i + j*d.stride; - int32_cuda index_src = i + j*src_stride; + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // column index + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int32_cuda index = i + j * d.stride; + int32_cuda index_src = i + j * src_stride; if (i < d.cols && j < d.rows) - dst[index] = alpha*src[index_src] + dst[index]; + dst[index] = alpha * src[index_src] + dst[index]; } template @@ -602,7 +602,37 @@ static void _add_mat_trans(Real alpha, const Real* src, Real* dst, MatrixDim d, template __global__ -static void _add_mat_mat_div_mat(const Real* A, const Real* B, const Real* C, Real* dst, MatrixDim d, int stride_a, +static void _add_mat_blocks(Real alpha, const Real* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, Real* dst, MatrixDim d, int src_stride) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + int32_cuda index_src = i + j * src_stride; + if (i < d.cols && j < d.rows) + for (int32_cuda p = 0; p < num_row_blocks; p++) { + for (int32_cuda q = 0; q < num_col_blocks; q++) { + dst[index] = alpha * src[index_src + p * src_stride * d.rows + q * d.cols] + dst[index]; + } + } +} + +template +__global__ +static void _add_mat_blocks_trans(Real alpha, const Real* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, Real* dst, MatrixDim d, int src_stride) { + int32_cuda i = blockIdx.x 
* blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + int32_cuda index_src = j + i * src_stride; + if (i < d.cols && j < d.rows) + for (int32_cuda p = 0; p < num_row_blocks; p++) { + for (int32_cuda q = 0; q < num_col_blocks; q++) { + dst[index] = alpha * src[index_src + p * src_stride * d.cols + q * d.rows] + dst[index]; + } + } +} + +template +__global__ +static void _add_mat_mat_div_mat(const Real* A, const Real* B, const Real* C, Real* dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; @@ -619,7 +649,7 @@ static void _add_mat_mat_div_mat(const Real* A, const Real* B, const Real* C, Re // Given a matrix input S (not packed!) and a lower-triangular matrix L, // this function does S = beta S + alpha * L^T L. This is used in PSD matrix inversion. -// The i index is the row of the destination S and the j the column (although of +// The i index is the row of the destination S and the j the column (although of // course the output is symmetric so it doesn't matter in a sense). The main point // of this is to make use of various symmetries and zero-ness. template @@ -628,14 +658,14 @@ static void _sy_add_tr2(Real alpha, Real beta, const Real *T, MatrixDim tdim, Re MatrixDim sdim) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; - + if (i >= sdim.rows || j > i) return; // this thread computes the dot-product of the i'th column of // L with the j'th column of L. The values we're multiplying // are only nonzero for row-index k greater or equal to // max(i, j), which equals i. - + Real sum = 0.0; for (int k = i; k < sdim.rows; k++) { int i_index = i + tdim.stride * k, @@ -682,27 +712,22 @@ static void _apply_mask(Real* mat, const char* mask, MatrixDim dmat, MatrixDim d int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda index = i + j*dmat.stride; int32_cuda index2 = i + j*dmask.stride; - if ( i < dmat.cols && j < dmat.rows ) + if ( i < dmat.cols && j < dmat.rows ) if(mask[index2] == 0) mat[index] = 0; } template __global__ static void _add_mat_diag_vec(Real alpha, Real *mat, MatrixDim mat_dim, - const Real *mat2, int mat2_row_stride, int mat2_col_stride, + const Real *mat2, int mat2_row_stride, int mat2_col_stride, const Real *vec, Real beta) { - // Note from Dan: in this kernel, we make the x dimension correspond to the - // row index and y to the column index. That was not always the case for - // earlier kernels written by others. - int i = blockIdx.x * blockDim.x + threadIdx.x; // row index - int j = blockIdx.y * blockDim.y + threadIdx.y; // column index - - int index = i * mat_dim.stride + j, - index2 = i * mat2_row_stride + j * mat2_col_stride; - - if (i < mat_dim.rows && j < mat_dim.cols) { - mat[index] = alpha * mat2[index2] * vec[j] + beta * mat[index]; - } + int i = blockIdx.x * blockDim.x + threadIdx.x; // column index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + + int index = i + j * mat_dim.stride, + index2 = i * mat2_col_stride + j * mat2_row_stride; + if (j < mat_dim.rows && i < mat_dim.cols) + mat[index] = alpha * mat2[index2] * vec[i] + beta * mat[index]; } template @@ -738,7 +763,7 @@ static void _set_bias_params(Real* v, const Real* a, Real param_1, Real param_2, v[i] = v[i] / factor; } else if ( ratio > param_1 ) { Real factor = ((ratio/param_1) > param_2) ? 
param_2 : (ratio/param_1); - v[i] = v[i] * factor; + v[i] = v[i] * factor; } } } @@ -749,7 +774,7 @@ __global__ static void _copy_from_vec_df(double* v_out, const Real* v_in, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // if (blockIdx.y > 0) return; - + if (i < dim) { v_out[i] = (double) v_in[i]; } @@ -757,30 +782,25 @@ static void _copy_from_vec_df(double* v_out, const Real* v_in, int dim) { // This kernel writes a copy of the vector "v_in" to each row of the matrix -// "m_out". the dimension of v_in should be equal to the #columns of m_out. In -// this kernel, following the new pattern, x corresponds to row-index and y to -// column-index. +// "m_out". the dimension of v_in should be equal to the #columns of m_out. template __global__ static void _copy_rows_from_vec(Real* m_out, MatrixDim d, const Real* v_in) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // row index. - int j = blockIdx.y * blockDim.y + threadIdx.y; // column index. - - if (i < d.rows && j < d.cols) { - int index = i * d.stride + j; - m_out[index] = v_in[j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // column index. + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index. + if (i < d.cols && j < d.rows) { + int index = i + j * d.stride; + m_out[index] = v_in[i]; } } - - template __global__ static void _copy_from_vec_fd(float* v_out, const Real* v_in, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // if (blockIdx.y > 0) return; - - if ( i < dim) { + + if (i < dim) { v_out[i] = (float) v_in[i]; } } @@ -792,7 +812,7 @@ static void _vec_min(const Real* v, Real* value, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= CU1DBLOCK) return; - + __shared__ Real row_data[CU1DBLOCK]; int block_size = (dim + CU1DBLOCK - 1) / CU1DBLOCK; @@ -841,84 +861,128 @@ static void _vec_max(const Real* v, Real* value, int dim) { } -// _trace_mat_mat expects to be called with 1 blocks, each of dimension -// CU1DBLOCK. Each block outputs a partial sum to value[blockIdx.x], -// i.e. value[0 through 0]. -template -__global__ -static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, int B_stride, Real* value) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; +// _trace_mat_mat reduce the partial sum to value[blockIdx.y * gridDim.x + blockIdx.x] +// It use shared mem to transpose matrix B to ensure coalesced memory access +template +__global__ +static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, + int B_stride, Real* value) { + // Reuse shared mem and make indexing easier. 
"+1" to avoid bank conflict + __shared__ union { + Real trans[TileDim][TileDim + 1]; + Real sum[CU1DBLOCK]; + } smem; + const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x; // linear thread id; + const int32_cuda grid_height = gridDim.y * TileDim; + + const int32_cuda ja = blockIdx.x * TileDim + threadIdx.x; + const int32_cuda ib = blockIdx.x * TileDim + threadIdx.y; + int32_cuda ia = blockIdx.y * TileDim + threadIdx.y; + int32_cuda jb = blockIdx.y * TileDim + threadIdx.x; + + // Grid reduce + Real tsum = Real(0); + for (int32_cuda i0 = 0; i0 < dA.rows; i0 += grid_height) { + // Load from B, transpose the block and store in shared mem + if (jb < dA.rows) { +# pragma unroll + for (int i = 0; i < TileDim; i += CU1DBLOCK / TileDim) { + if (ib + i < dA.cols) { + smem.trans[threadIdx.x][threadIdx.y + i] = + B[(ib + i) * B_stride + jb]; + } + } + } + __syncthreads(); - if(blockIdx.x > num_blocks || threadIdx.x > CU1DBLOCK) return; - - int num_elements = dA.rows * dA.cols, - num_threads = CU1DBLOCK * num_blocks; - int block_size = (num_elements + num_threads - 1) / num_threads; - int loop_start = i * block_size, loop_end = (i + 1) * block_size; - if (loop_end > num_elements) - loop_end = num_elements; + // Load from A, sum up the product. + if (ja < dA.cols) { +# pragma unroll + for (int i = 0; i < TileDim; i += CU1DBLOCK / TileDim) { + if (ia + i < dA.rows) { + tsum += A[(ia + i) * dA.stride + ja] + * smem.trans[threadIdx.y + i][threadIdx.x]; + } + } + } + __syncthreads(); - Real sum = 0.0; - for (int j = loop_start; j < loop_end; j++) { - // for (int j = i; j < num_elements; j += num_threads) { - int row = j / dA.cols, col = j % dA.cols; // "row" is row-index in A, "col" is - // col-index in A; in B, it's reversed. - int index_A = col + row * dA.stride, - index_B = row + col * B_stride; - sum += A[index_A] * B[index_B]; + ia += grid_height; + jb += grid_height; } - __shared__ Real row_data[CU1DBLOCK]; - - row_data[threadIdx.x] = sum; + smem.sum[tid] = tsum; __syncthreads(); - Real ans = _sum_reduce(row_data); - if (threadIdx.x == 0) - value[blockIdx.x] = ans; + // Block reduce +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { + if (tid < shift) + smem.sum[tid] += smem.sum[tid + shift]; + __syncthreads(); + } + + // Warp reduce. Implicitly synchronized within a warp. + if (tid < warpSize) { +# pragma unroll + for (int shift = warpSize; shift > 0; shift >>= 1) { + smem.sum[tid] += smem.sum[tid + shift]; + } + } + + // output 1 sum per thread block + if (tid == 0) { + value[blockIdx.y * gridDim.x + blockIdx.x] = smem.sum[0]; + } } -// _trace_mat_mat_trans expects to be called with 4 blocks, each of dimension -// CU1DBLOCK. Each block outputs a partial sum to value[blockIdx.x], -// i.e. value[0 through 3]. 
-template +// _trace_mat_mat_trans reduce the partial sum to value[blockIdx.y * gridDim.x + blockIdx.x] +template __global__ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, int B_stride, Real* value) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - - if(blockIdx.x > num_blocks || threadIdx.x > CU1DBLOCK) return; - - int num_elements = dA.rows * dA.cols, - num_threads = CU1DBLOCK * num_blocks; - // int block_size = (num_elements + num_threads - 1) / num_threads; - // int loop_start = i * block_size, loop_end = (i + 1) * block_size; - // if (loop_end > num_elements) - // loop_end = num_elements; - - Real sum = 0.0; - // for (int j = loop_start; j < loop_end; j++) { - for (int j = i; j < num_elements; j += num_threads) { - int row = j / dA.cols, col = j % dA.cols; // "row" is row-index in A, "col" is - // col-index in A; in B, it's reversed. - int index_A = col + row * dA.stride, - index_B = col + row * B_stride; - sum += A[index_A] * B[index_B]; + __shared__ Real ssum[CU1DBLOCK]; + const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x; // linear thread id; + const int32_cuda j = blockIdx.x * blockDim.x + threadIdx.x; + const int32_cuda grid_height = gridDim.y * blockDim.y; + int32_cuda i = blockIdx.y * blockDim.y + threadIdx.y; + + // Grid reduce + Real tsum = Real(0); + if (j < dA.cols) { + while (i < dA.rows) { + tsum += A[i * dA.stride + j] * B[i * B_stride + j]; + i += grid_height; + } } - __shared__ Real row_data[CU1DBLOCK]; + ssum[tid] = tsum; + __syncthreads(); - row_data[threadIdx.x] = sum; + // Block reduce +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { + if (tid < shift) + ssum[tid] += ssum[tid + shift]; + __syncthreads(); + } - __syncthreads(); + // Warp reduce. Implicitly synchronized within a warp. + if (tid < warpSize) { +# pragma unroll + for (int shift = warpSize; shift > 0; shift >>= 1) { + ssum[tid] += ssum[tid + shift]; + } + } - Real ans = _sum_reduce(row_data); - if (threadIdx.x == 0) - value[blockIdx.x] = ans; + // output 1 sum per thread block + if (tid == 0) { + value[blockIdx.y * gridDim.x + blockIdx.x] = ssum[0]; + } } // Adds diag(M N) to v, where M and N are matrices. We supply row_stride and // col_stride arguments for M and N, and swapping them allows us to transpose -// those matrices. Note: we imagine row-major indexing here, just like Kaldi +// those matrices. Note: we imagine row-major indexing here, just like Kaldi // and CBLAS (but unlike CUBLAS). // This kernel expects the blockDim to be (CU1DBLOCK, 1) and the // gridDim times CU1DBLOCK to be at least num-rows-of-v * threads_per_element. @@ -929,24 +993,24 @@ static void _add_diag_mat_mat( Real alpha, Real* v, int v_dim, const Real* M, int M_cols, int M_row_stride, int M_col_stride, const Real *N, int N_row_stride, int N_col_stride, int threads_per_element, Real beta) { - + // we actually assume blockDim.x == CU1DBLOCK here. // Each diagonal element of v is processed by "threads_per_element" threads. __shared__ Real temp_data[CU1DBLOCK]; int i = blockIdx.x * blockDim.x + threadIdx.x; int v_idx = i / threads_per_element, // v_idx is the index into v that we are supposed to - sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells + sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells // us which block of elements we sum up. 
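// (In other words, each output element v[v_idx] is computed cooperatively by
//  'threads_per_element' consecutive threads: thread sub_idx accumulates
//  every threads_per_element-th term of the dot-product over M_cols, the
//  per-thread partial sums are combined by the tree reduction over temp_data
//  further down, and only the sub_idx == 0 thread writes the result back
//  into v.)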
- if (v_idx >= v_dim) return; - - Real sum = 0.0; - for (int j = sub_idx; j < M_cols; j += threads_per_element) { - int M_index = v_idx * M_row_stride + j * M_col_stride, - N_index = j * N_row_stride + v_idx * N_col_stride; - sum += M[M_index] * N[N_index]; + if (v_idx < v_dim) { + Real sum = 0.0; + for (int j = sub_idx; j < M_cols; j += threads_per_element) { + int M_index = v_idx * M_row_stride + j * M_col_stride, + N_index = j * N_row_stride + v_idx * N_col_stride; + sum += M[M_index] * N[N_index]; + } + temp_data[threadIdx.x] = sum; } - temp_data[threadIdx.x] = sum; // start_idx = threadIdx.x - sub_idx; // start of the position in temp_data // that we want to sum up. @@ -966,7 +1030,7 @@ static void _add_diag_mat_mat( __syncthreads(); num_total_threads = half_point; } - if (sub_idx == 0) { + if (sub_idx == 0 && v_idx < v_dim) { v[v_idx] = beta * v[v_idx] + alpha * temp_data[threadIdx.x]; } } @@ -983,18 +1047,6 @@ static void _add_vec_vec(Real alpha, Real* v, const Real* x, const Real* y, Real } -template -__global__ -static void _copy_col_from_mat(Real* v, int col, const Real* mat, MatrixDim dmat, int dim) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda index = col + i * dmat.stride; - // if (blockIdx.y > 0) return; - - if (i < dim) - v[i] = mat[index]; -} - - template __global__ static void _copy_col_from_mat_df(double* v, int col, const Real* mat, MatrixDim dmat, int dim) { @@ -1024,10 +1076,10 @@ __global__ static void _vec_apply_exp(Real* v, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // if (blockIdx.y > 0) return; - + if (i < dim) { v[i] = exp(v[i]); - } + } } @@ -1036,7 +1088,7 @@ __global__ static void _vec_apply_log(Real* v, Real* flag, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // if (blockIdx.y > 0) return; - + if (i < dim) { if (v[i] < 0) { *flag = 1; @@ -1072,10 +1124,10 @@ static void _cuda_comp_obj_deriv(MatrixElement *x, int s, const Real* z, M for(int j = loop_start; j< loop_end; j++) { int m = (x + j)->row; //* ((int*) ((size_t)x + j * (2 * sizeof(int) + sizeof(Real) )) ); int label = (x + j)->column; //*(int*) ((size_t)x + j * (2 * sizeof(int) + sizeof(Real) )+ sizeof(int)); - Real weight = (x + j)->weight; //*(Real*) ((size_t)x + j * (2 * sizeof(int) + sizeof(Real) ) + 2 * sizeof(int)); + Real weight = (x + j)->weight; //*(Real*) ((size_t)x + j * (2 * sizeof(int) + sizeof(Real) ) + 2 * sizeof(int)); tmp_weight_sum += weight; Real this_prob = *(z + m * d.stride + label); - tmp_tot_objf += weight * log(this_prob); + tmp_tot_objf += weight * log(this_prob); *(z2 + m * d2.stride + label ) += weight / this_prob;// there might be problems here.... 
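// (The objective being accumulated here is sum_j weight_j * log z(row_j,
//  label_j), so its derivative with respect to z(row_j, label_j) is
//  weight_j / z(row_j, label_j), which is the quantity added into z2 above;
//  the reductions below then leave the total objective in t[0] and the total
//  weight in t[1].)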
} @@ -1084,34 +1136,32 @@ static void _cuda_comp_obj_deriv(MatrixElement *x, int s, const Real* z, M __syncthreads(); *t = _sum_reduce(tot_objf); __syncthreads(); - *(t+1) = _sum_reduce(tot_weight); + *(t+1) = _sum_reduce(tot_weight); return; } template __global__ -static void _cuda_matrix_add_elements(Real *data, MatrixDim dim, Real alpha, MatrixElement* x, int s) { - int i = threadIdx.x; +static void _cuda_matrix_add_elements(Real *data, MatrixDim dim, Real alpha, MatrixElement* x, int num_elements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_elements) + return; + data[x[i].row * dim.stride + x[i].column] += alpha * x[i].weight; +} + +template +__global__ +static void _cuda_matrix_add_indexed_values(MatrixDim dim, Real alpha, + const Int32Pair* indices, const Real* x, + int s, Real* data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= s) return; - int size = s / CU1DBLOCK; //the least size in a loop (later part) - int threshold = s - size * CU1DBLOCK; //any loop below this number would + 1 - - int loop_start; - int loop_end; - if(i < threshold) { - loop_start = i * (size + 1); - loop_end = (i+1) * (size + 1); - } - else { - loop_start = threshold + i*size; - loop_end = threshold + (i+1)*size; - } - for(int j = loop_start; j < loop_end; j++) { - *(data + x[j].row * dim.stride + x[j].column) += alpha * x[j].weight; - } + int data_i = indices[i].first * dim.stride + indices[i].second; + data[data_i] += alpha * x[i]; } + template __global__ static void _matrix_lookup(const Real *data, MatrixDim dim, @@ -1127,8 +1177,8 @@ static void _matrix_lookup(const Real *data, MatrixDim dim, template __global__ static void _equal_element_mask(const Real *mat1, const Real *mat2, Real *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; //col - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; //row + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // col + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row int32_cuda index_mat1 = i + j*mat1_dim.stride; int32_cuda index_mat2 = i + j*mat2_stride; int32_cuda index_mask = i + j*mask_stride; @@ -1140,10 +1190,10 @@ template __global__ static void _vec_sum(Real *v, Real *sum, int dim, int inc) { int i = threadIdx.x; - __shared__ Real row_data[CU1DBLOCK]; + __shared__ Real row_data[CU1DBLOCK]; if (i >= CU1DBLOCK) return; - + Real tmp_sum = 0; int size = dim / CU1DBLOCK; //the least size in a loop (later part) int threshold = dim - size * CU1DBLOCK; //any loop below this number would + 1 @@ -1161,7 +1211,7 @@ static void _vec_sum(Real *v, Real *sum, int dim, int inc) { for(int j = loop_start; j< loop_end; j++) { tmp_sum += v[j * inc]; } - + row_data[threadIdx.x] = tmp_sum; __syncthreads(); *sum = _sum_reduce(row_data); @@ -1173,7 +1223,6 @@ __global__ static void _pvec_sum(Real* v, Real* g, int dim, int size) { int i = blockIdx.x * blockDim.x + threadIdx.x; int start = size * i; - if (start >= dim) return; int end = start + size; if (end > dim) end = dim; __shared__ Real row_data[CU1DBLOCK]; @@ -1191,7 +1240,7 @@ template __global__ static void _vec_apply_floor(Real *v, Real floor_val, float *count, int dim) { int i = blockIdx.x * blockDim.x + threadIdx.x; - + if ( i < dim) { if ( v[i] < floor_val) { v[i] = floor_val; @@ -1202,18 +1251,28 @@ static void _vec_apply_floor(Real *v, Real floor_val, float *count, int dim) { } } - -// Caution, here i/block{idx,dim}.x is the row index and j/block{idx,dim}.y is the col index. 
-// this is for no reason, really, I just happened to prefer this -// at the time. [dan] template __global__ -static void _apply_pow(Real* mat, Real power, MatrixDim d) { +static void _vec_apply_ceiling(Real *v, Real ceiling_val, float *count, int dim) { int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - int index = i * d.stride + j; - if (i < d.rows && j < d.cols) { + if ( i < dim) { + if ( v[i] > ceiling_val) { + v[i] = ceiling_val; + count[i] = 1; + } else { + count[i] = 0; + } + } +} + +template +__global__ +static void _apply_pow(Real* mat, Real power, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int index = i + j * d.stride; + if (i < d.cols && j < d.rows) { if (power == 1.0) return; if (power == 2.0) { @@ -1231,13 +1290,12 @@ static void _apply_pow(Real* mat, Real power, MatrixDim d) { template __global__ static void _apply_pow_abs(Real* mat, Real power, bool include_sign, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - int index = i * d.stride + j; - - if (i < d.rows && j < d.cols) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int index = i + j * d.stride; + if (i < d.cols && j < d.rows) { if (include_sign == true && mat[index] < 0) { - if (power == 1.0) + if (power == 1.0) mat[index] = -std::abs(mat[index]); if (power == 2.0) { mat[index] = -mat[index] * mat[index]; @@ -1247,7 +1305,7 @@ static void _apply_pow_abs(Real* mat, Real power, bool include_sign, MatrixDim d mat[index] = -pow(std::abs(mat[index]), power); } } else { - if (power == 1.0) + if (power == 1.0) mat[index] = std::abs(mat[index]); if (power == 2.0) { mat[index] = mat[index] * mat[index]; @@ -1262,27 +1320,22 @@ static void _apply_pow_abs(Real* mat, Real power, bool include_sign, MatrixDim d } } -// Caution, here i/block{idx,dim}.x is the row index and j/block{idx,dim}.y is the col index. -// this is for no reason, really, I just happened to prefer this -// at the time. [dan] template __global__ static void _apply_heaviside(Real* mat, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - int index = i * d.stride + j; - - if (i < d.rows && j < d.cols) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int index = i + j * d.stride; + if (i < d.cols && j < d.rows) mat[index] = (mat[index] > 0.0 ? 1.0 : 0.0); - } } template __global__ static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index int index = i + j * d.stride; if (i < d.cols && j < d.rows) { @@ -1295,57 +1348,50 @@ static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) { template __global__ static void _copy_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { - // Note: in this kernel, the x dimension corresponds to rows and the y to columns, - // as it will be going forward. 
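// With the convention used in the rewritten kernels above (threadIdx.x / blockIdx.x walk
// columns, threadIdx.y / blockIdx.y walk rows), consecutive threads of a warp touch
// consecutive elements of a row, so global-memory accesses coalesce. A minimal host-side
// launch sketch under that assumption (the real grid set-up lives in the callers, e.g.
// cu-matrix.cc, and may differ):
//   dim3 Bl(CU2DBLOCK, CU2DBLOCK);                       // e.g. 16 x 16 threads
//   dim3 Gr(n_blocks(dst_dim.cols, CU2DBLOCK),
//           n_blocks(dst_dim.rows, CU2DBLOCK));
//   cuda_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);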
- - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int index = reorder[j], - dst_index = i * dst_dim.stride + j; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int index = reorder[i], + dst_index = j * dst_dim.stride + i; if (index >= 0) { - int src_index = i * src_stride + reorder[j]; - Real val = src[src_index]; + int src_index = j * src_stride + reorder[i]; + Real val = src[src_index]; dst[dst_index] = val; } else { dst[dst_index] = 0.0; } - } + } } template __global__ -static void _add_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { - // Note: in this kernel, the x dimension corresponds to rows and the y to columns, - // as it will be going forward. - - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int index = reorder[j], - dst_index = i * dst_dim.stride + j; +static void _add_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int index = reorder[i], + dst_index = j * dst_dim.stride + i; if (index >= 0) { - int src_index = i * src_stride + reorder[j]; - Real val = src[src_index]; + int src_index = j * src_stride + index; + Real val = src[src_index]; dst[dst_index] += val; } - } + } } template __global__ -static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { - // Note: in this kernel, the x dimension corresponds to rows and the y to columns, - // as it will be going forward. 
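// (In the rewritten version below, reorder[] is indexed by the *destination* row j:
//  dst row j is copied from src row reorder[j], and a negative reorder[j] means
//  "no source row", in which case the destination row is zero-filled. _copy_cols
//  above treats reorder[] the same way, per destination column.)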
- - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int index = reorder[i], - dst_index = i * dst_dim.stride + j; +static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int index = reorder[j], + dst_index = j * dst_dim.stride + i; if (index >= 0) { - int src_index = reorder[i] * src_stride + j; - Real val = src[src_index]; + int src_index = reorder[j] * src_stride + i; + Real val = src[src_index]; dst[dst_index] = val; } else { dst[dst_index] = 0; @@ -1356,12 +1402,13 @@ static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reor template __global__ static void _copy_rows(Real* dst, const Real *const *src, MatrixDim dst_dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int dst_index = i * dst_dim.stride + j; - if (src[i] != NULL) { - dst[dst_index] = src[i][j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int dst_index = j * dst_dim.stride + i; + const Real *pointer = src[j]; + if (pointer != NULL) { + dst[dst_index] = pointer[i]; } else { dst[dst_index] = 0; } @@ -1372,11 +1419,12 @@ template __global__ static void _copy_to_rows(Real* const* dst, const Real *src, MatrixDim src_dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < src_dim.rows && j < src_dim.cols) { - if (dst[i] != NULL) { - dst[i][j] = src[i * src_dim.stride + j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < src_dim.cols && j < src_dim.rows) { + Real *pointer = dst[j]; + if (pointer != NULL) { + pointer[i] = src[j * src_dim.stride + i]; } } } @@ -1386,27 +1434,27 @@ __global__ static void _add_rows(Real alpha, Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int dst_index = i * dst_dim.stride + j; - if (reorder[i] >= 0) { - int src_index = reorder[i] * src_stride + j; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int dst_index = j * dst_dim.stride + i; + if (reorder[j] >= 0) { + int src_index = reorder[j] * src_stride + i; dst[dst_index] += alpha * src[src_index]; } - } + } } template __global__ static void _add_rows(Real alpha, Real* dst, const Real *const *src, MatrixDim dst_dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int dst_index = i * dst_dim.stride + j; - if (src[i] != NULL) { - dst[dst_index] += alpha * src[i][j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int dst_index = j * dst_dim.stride + i; + if (src[j] != NULL) { + dst[dst_index] += 
alpha * src[j][i]; } } } @@ -1415,11 +1463,11 @@ template __global__ static void _add_to_rows(Real alpha, Real* const* dst, const Real *src, MatrixDim src_dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < src_dim.rows && j < src_dim.cols) { - if (dst[i] != NULL) { - dst[i][j] += alpha * src[i * src_dim.stride + j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < src_dim.cols && j < src_dim.rows) { + if (dst[j] != NULL) { + dst[j][i] += alpha * src[j * src_dim.stride + i]; } } } @@ -1471,7 +1519,7 @@ static void _add_mat_blockmat_trans(Real *data, MatrixDim dim, const Real *A_dat BT_col_stride = cu_data.matrix_dim.stride; const Real *B_data = static_cast(cu_data.matrix_data); // Cast from void; // we avoided a bunch of hassle by doing this (relates to Ansi-C requirement). - + for (int k = 0; k < BT_num_cols; k++) { const Real *this_BT_col = B_data + k * BT_col_stride; const Real *this_A_row = A_data + i * A_row_stride + BT_row_start * A_col_stride; @@ -1496,7 +1544,7 @@ static void _add_mat_blockmat(Real *data, MatrixDim dim, const Real *A_data, int if (i >= A_num_rows || j >= B_num_blocks) return; const CuBlockMatrixData &block_data = B_cu_data[j]; - + int B_row_start = block_data.row_offset, B_col_start = block_data.col_offset, B_num_rows = block_data.matrix_dim.rows, @@ -1504,7 +1552,7 @@ static void _add_mat_blockmat(Real *data, MatrixDim dim, const Real *A_data, int B_row_stride = block_data.matrix_dim.stride; const Real *B_data = static_cast(block_data.matrix_data); // Cast from void; // we avoided a bunch of hassle by doing this (relates to Ansi-C requirement). - + for (int k = 0; k < B_num_cols; k++) { const Real *this_B_col = B_data + k; const Real *this_A_row = A_data + i * A_row_stride + B_row_start * A_col_stride; @@ -1551,7 +1599,7 @@ static void _block_add_mat_mat(CuBlockMatrixData *B_cu_data, int num_blocks, i * block_data.matrix_dim.stride + j; Real B_val = *B_elem; - + // B_row and B_col are the (row, col) index into the full matrix B. int B_row = block_data.row_offset + i, B_col = block_data.col_offset + j; @@ -1585,7 +1633,7 @@ static void _blockadd_mat_blockmat_trans(Real *data, MatrixDim dim, const Real * BT_col_stride = cu_data.matrix_dim.stride; const Real *B_data = static_cast(cu_data.matrix_data); // Cast from void; // we avoided a bunch of hassle by doing this (relates to Ansi-C requirement). - + for (int k = 0; k < BT_num_cols; k++) { const Real *this_BT_col = B_data + k * BT_col_stride; const Real *this_A_row = A_data + i * A_row_stride + BT_row_start * A_col_stride; @@ -1600,17 +1648,14 @@ static void _blockadd_mat_blockmat_trans(Real *data, MatrixDim dim, const Real * } } - -// Since this is a newer kernel, x is the row-index and y is the -// column-index. 
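// (The two range kernels that follow, _sum_column_ranges and _add_row_ranges, now use
//  the same x = column / y = row thread mapping as the element-wise kernels above.
//  Each Int32Pair gives a half-open [first, second) range of source indices -- rows in
//  _add_row_ranges and, by symmetry, columns in _sum_column_ranges -- whose elements are
//  accumulated into a single destination element.)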
template __global__ static void _sum_column_ranges(Real *data, MatrixDim dim, const Real *src_data, MatrixDim src_dim, const Int32Pair *indices) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - int col = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + int row = blockIdx.y * blockDim.y + threadIdx.y; if (row >= dim.rows || col >= dim.cols) return; int dst_index = row * dim.stride + col, @@ -1626,15 +1671,16 @@ template __global__ static void _add_row_ranges(Real *data, MatrixDim dim, const Real *src_data, MatrixDim src_dim, const Int32Pair *indexes) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - int col = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + int row = blockIdx.y * blockDim.y + threadIdx.y; if (row >= dim.rows || col >= dim.cols) return; int dst_index = row * dim.stride + col; - for (int row_index = indexes[col].first; - row_index < indexes[col].second; row_index++) { + int src_index_start = indexes[row].first, + src_index_end = indexes[row].second; + for (int row_index = src_index_start; row_index < src_index_end; + row_index++) data[dst_index] += src_data[row_index * src_dim.stride + col]; - } } template @@ -1654,7 +1700,7 @@ static void _soft_hinge(Real*y, const Real*x, MatrixDim d, int src_stride) { template __global__ -static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, +static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, int group_size, Real power) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; @@ -1663,20 +1709,20 @@ static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, Real tmp = 0; int src_begin_index = i * group_size + j * src_stride; int src_end_index = src_begin_index + group_size; - for (int src_index = src_begin_index; src_index < src_end_index; + for (int src_index = src_begin_index; src_index < src_end_index; src_index ++) { - tmp += pow(std::abs(x[src_index]), power); + tmp += pow(std::abs(x[src_index]), power); } tmp = pow(tmp, Real(1.0 / power)); if (!isnan(tmp)) { y[dst_index] = tmp; } else { Real max_value = x[src_begin_index], min_value = max_value; - for (int src_index = src_begin_index + 1; + for (int src_index = src_begin_index + 1; src_index < src_end_index; src_index ++) { - if (x[src_index] > max_value) + if (x[src_index] > max_value) max_value = x[src_index]; - if (x[src_index] < min_value) + if (x[src_index] < min_value) min_value = x[src_index]; } tmp = 0.0; @@ -1689,7 +1735,7 @@ static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, for (int src_index = src_begin_index; src_index < src_end_index; src_index ++) { Real x_scaled = x[src_index] / max_abs_value; - tmp += pow(std::abs(x_scaled), Real(power)); + tmp += pow(std::abs(x_scaled), Real(power)); } y[dst_index] = pow(tmp, Real(1.0 / power)) * max_abs_value; } @@ -1740,7 +1786,7 @@ static void _diff_sigmoid(Real*eout, const Real*e, const Real*y, MatrixDim d, in int dst_index = i + j*d.stride; int e_index = i + j*e_stride; int y_index = i + j*y_stride; - if (i < d.cols && j < d.rows ) + if (i < d.cols && j < d.rows ) eout[dst_index] = y[y_index]*(1.0-y[y_index]) * e[e_index]; } @@ -1769,13 +1815,26 @@ __global__ static void _diff_tanh(Real*eout, const Real*e, const Real*y, MatrixDim d, int e_stride, int y_stride) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; - int dst_index = i + j*d.stride; - int 
e_index = i + j*e_stride; + int dst_index = i + j*d.stride; + int e_index = i + j*e_stride; int y_index = i + j*y_stride; - if (i < d.cols && j < d.rows ) + if (i < d.cols && j < d.rows ) eout[dst_index] = (1.0 - y[y_index]*y[y_index]) * e[e_index]; } +template +__global__ +static void _heaviside(Real*y, const Real*x, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j*d.stride, src_index = i + j*src_stride; + if(i < d.cols && j < d.rows) { + Real res = (x[src_index] > 0.0 ? 1.0 : 0.0); + y[dst_index] = res; + } +} + + template __global__ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { @@ -1809,7 +1868,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { } Real max = aux[0]; __syncthreads(); - + // subtract max, apply exp, sum up... y[threadIdx.x+j*d.stride] = exp(x[threadIdx.x+j*d.stride] - max); aux[threadIdx.x] = y[threadIdx.x+j*d.stride]; @@ -1980,10 +2039,10 @@ static void _vec_copy_diag_from_packed(Real* y, const Real* x, int dim) { template __global__ static void _copy_from_sp(const Real* x, Real* y, MatrixDim dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dim.rows && j < dim.cols) { - int dst_index = i * dim.stride + j, src_index; + int i = blockIdx.x * blockDim.x + threadIdx.x; // column index + int j = blockIdx.y * blockDim.y + threadIdx.y; // + if (i < dim.cols && j < dim.rows) { + int dst_index = i + j * dim.stride, src_index; if (j <= i) { // no transpose src_index = (i * (i+1) / 2) + j; } else { // transpose. @@ -2041,7 +2100,7 @@ static void _regularize_l1(Real* wei, Real* grad, Real l1, Real lr, MatrixDim d, if (i < d.cols && j < d.rows) { if(wei[index]==0.0) return; //skip L1 if zero weight! - + Real l1_signed = l1; if(wei[index] < 0.0) //flip sign l1_signed = -l1; @@ -2057,36 +2116,63 @@ static void _regularize_l1(Real* wei, Real* grad, Real l1, Real lr, MatrixDim d, } } - - template __global__ -static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, int32_cuda voff, MatrixDim d) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; +static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, + MatrixDim d) { + const int32_cuda i = blockIdx.x; + const int32_cuda base = i * d.stride; + const int32_cuda tid = threadIdx.x; - if(blockIdx.x > 0) return; - if(blockDim.y != 1) return; + __shared__ Real smax[CU1DBLOCK]; + __shared__ int32_cuda sidx[CU1DBLOCK]; - __shared__ Real value[CU1DBLOCK]; - __shared__ int32_cuda index[CU1DBLOCK]; + Real tmax = -1e20; + int32_cuda tidx = -1; - //copy to shared memory - value[threadIdx.x] = mat[i+j*d.stride]; - index[threadIdx.x] = threadIdx.x; - __syncthreads(); - - //get the id of the max value - int32_cuda out_max = _max_id_reduce(value, index); - __syncthreads(); + // Loop over blocks for coalesced memory access. 
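// (Each thread block handles one matrix row i; thread tid scans columns
//  tid, tid + CU1DBLOCK, tid + 2*CU1DBLOCK, ..., keeping its running maximum and the
//  corresponding column index, so in every pass the CU1DBLOCK threads read CU1DBLOCK
//  consecutive elements of the row. The per-thread (max, argmax) pairs are then combined
//  by the shared-memory tree reduction and the final warp reduction below.)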
+ for (int32_cuda j = tid; j < d.cols; j += CU1DBLOCK) { + const Real val = mat[base + j]; + if (val > tmax) { + tmax = val; + tidx = j; + } + } - //see if it's bigger value - if(threadIdx.x == 0) { - if(vec_val[j] <= mat[out_max+j*d.stride]) { - vec_val[j] = mat[out_max+j*d.stride]; - vec_id[j] = voff+out_max; + smax[tid] = tmax; + sidx[tid] = tidx; + + // Parallel reduce + #pragma unroll + for (int32_cuda num_working_threads = CU1DBLOCK / 2; + num_working_threads >= warpSize; num_working_threads >>= 1) { + __syncthreads(); + if (tid < num_working_threads) { + if (smax[tid + num_working_threads] > smax[tid]) { + smax[tid] = smax[tid + num_working_threads]; + sidx[tid] = sidx[tid + num_working_threads]; + } } } + // Warp reduce without __syncthreads() + // (note.: synchronizes implicitly within a warp at the multiprocessor) + if (tid < warpSize / 2) { + #pragma unroll + for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; + num_working_threads >>= 1) { + if (smax[tid + num_working_threads] > smax[tid]) { + smax[tid] = smax[tid + num_working_threads]; + sidx[tid] = sidx[tid + num_working_threads]; + } + } + } + + if (tid == 0) { + if (vec_val) { + vec_val[i] = smax[0]; + } + vec_id[i] = sidx[0]; + } } @@ -2113,10 +2199,13 @@ static void _diff_xent(const int32_cuda* vec_tgt, Real* mat_net_out, Real* vec_l */ /* - * "int32" + * "int32" */ -void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda* mat, int32_cuda value, MatrixDim d) { - _set_const<<>>(mat,value,d); +void cuda_int32_set_const(dim3 Gr, dim3 Bl, int32_cuda* mat, int32_cuda value, MatrixDim d) { + _set_const<<>>(mat,value,d); +} +void cuda_int32_add(dim3 Gr, dim3 Bl, int32_cuda* mat, int32_cuda value, MatrixDim d) { + _add<<>>(mat,value,d); } @@ -2151,11 +2240,6 @@ void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B, MatrixDim _copy_from_tp<<>>(A,B,dmat); } - -void cudaF_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d) { - _copy_col_from_vec<<>>(mat,v,col,d); -} - void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _transpose_matrix<<>>(mat, d); } @@ -2174,7 +2258,6 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_heaviside<<>>(mat, d); - } void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { @@ -2230,7 +2313,7 @@ void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { } void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) { - _set_const<<>>(mat,value,d); + _set_const<<>>(mat,value,d); } void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { @@ -2238,7 +2321,7 @@ void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { } void cudaF_add(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) { - _add<<>>(mat,value,d); + _add<<>>(mat,value,d); } void cudaF_scale_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { @@ -2246,45 +2329,45 @@ void cudaF_scale_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { } void cudaF_scale(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) { - _scale<<>>(mat,value,d); + _scale<<>>(mat,value,d); } void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { - _apply_log<<>>(mat,d); + _apply_log<<>>(mat,d); } void cudaF_mul_elements(dim3 Gr, dim3 Bl, float* mat, const float* A, MatrixDim dst_d, 
int src_stride) { - _mul_elements<<>>(mat,A,dst_d,src_stride); + _mul_elements<<>>(mat,A,dst_d,src_stride); } void cudaF_div_elements(dim3 Gr, dim3 Bl, float* mat, const float* A, MatrixDim dst_d, int src_stride) { - _div_elements<<>>(mat,A,dst_d,src_stride); + _div_elements<<>>(mat,A,dst_d,src_stride); } void cudaF_max(dim3 Gr, dim3 Bl, float* mat, const float* A, MatrixDim dst_d, int src_stride) { - _max<<>>(mat,A,dst_d,src_stride); + _max<<>>(mat,A,dst_d,src_stride); } void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) { - _mul_cols_vec<<>>(mat,scale,d); + _mul_cols_vec<<>>(mat,scale,d); } void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) { _mul_rows_vec<<>>(mat,scale,d); } -void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, +void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) { _mul_rows_group_mat<<>>(y, x, d, src_stride, group_size); } -void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, +void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) { _calc_pnorm_deriv<<>>(y, x1, x2, d, src_stride, group_size, power); } -void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, +void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size) { _calc_group_max_deriv<<>>(y, x1, x2, d, src_stride, group_size); @@ -2296,12 +2379,20 @@ void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* vec_div, Matr void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float* src, float* dst, MatrixDim d, int src_stride, int A_trans) { if (A_trans) { - _add_mat_trans<<>>(alpha,src,dst,d,src_stride); + _add_mat_trans<<>>(alpha,src,dst,d,src_stride); } else { _add_mat<<>>(alpha,src,dst,d,src_stride); } } +void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float* dst, MatrixDim d, int src_stride, int A_trans) { + if (A_trans) { + _add_mat_blocks_trans<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride); + } else { + _add_mat_blocks<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride); + } +} + void cudaF_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { _add_mat_mat_div_mat<<>>(A,B,C,dst,d, stride_a, stride_b, stride_c); } @@ -2312,12 +2403,12 @@ void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, } void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float* col, float beta, float* dst, MatrixDim d) { - _add_vec_to_cols<<>>(alpha,col,beta,dst,d); + _add_vec_to_cols<<>>(alpha,col,beta,dst,d); } void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d) { - _add_vec_to_rows<<>>(alpha,row,beta,dst,d); + _add_vec_to_rows<<>>(alpha,row,beta,dst,d); } void cudaF_add_mat_diag_vec(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim, const float *mat2, int mat2_row_stride, int mat2_col_stride, const float *vec, float beta) { @@ -2331,7 +2422,7 @@ void cudaF_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA // CURRENTLY UNUSED... 
void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const char* mask, MatrixDim dmat, MatrixDim dmask) { - _apply_mask<<>>(mat,mask,dmat,dmask); + _apply_mask<<>>(mat,mask,dmat,dmask); } @@ -2367,17 +2458,17 @@ void cudaF_vec_max(const float* v, float* value, int dim) { _vec_max<<<1,CU1DBLOCK>>>(v, value, dim); } -void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat_trans <<<4,CU1DBLOCK>>>(A,B,dA,B_stride,value); +void cudaF_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { + _trace_mat_mat_trans<<>>(A,B,dA,B_stride,value); } -void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value); +void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { + _trace_mat_mat<32><<>>(A,B,dA,B_stride,value); } -void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, +void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, int N_col_stride, int threads_per_element, float beta) { _add_diag_mat_mat<<>>(alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); @@ -2395,8 +2486,12 @@ void cudaF_pvec_sum(int Gr, int Bl, float* v, float* pvec_sum, int dim, int size _pvec_sum<<>>(v, pvec_sum, dim, size); } -void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int s) { - _cuda_matrix_add_elements<<>>(data, dim, alpha, x, s); +void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int num_elements) { + _cuda_matrix_add_elements<<>>(data, dim, alpha, x, num_elements); +} + +void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, float alpha, const Int32Pair* indices, const float* x, int s, float* data) { + _cuda_matrix_add_indexed_values<<>>(dim, alpha, indices, x, s, data); } void cudaF_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) { @@ -2415,6 +2510,10 @@ void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float *cou _vec_apply_floor<<>>(v,floor_val,count,dim); } +void cudaF_vec_apply_ceiling(int Gr, int Bl, float* v, float ceiling_val, float *count, int dim) { + _vec_apply_ceiling<<>>(v, ceiling_val,count,dim); +} + void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim) { _vec_apply_exp<<>>(v,dim); } @@ -2440,7 +2539,7 @@ void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d, const fl _add_mat_blockmat<<>>(data, d, Adata, A_num_rows, A_num_cols, A_row_stride, A_col_stride, B_cu_data, B_num_blocks, alpha, beta); - + } } @@ -2457,7 +2556,7 @@ void cudaF_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int * cu:: */ void cudaF_soft_hinge (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { - _soft_hinge<<>>(y, x, d, src_stride); + _soft_hinge<<>>(y, x, d, src_stride); } void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power) { @@ -2469,7 +2568,7 @@ void cudaF_group_max(dim3 Gr, dim3 Bl, 
float *y, const float *x, MatrixDim d, in } void cudaF_sigmoid (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { - _sigmoid<<>>(y, x, d, src_stride); + _sigmoid<<>>(y, x, d, src_stride); } void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d, int e_stride, int y_stride) { @@ -2477,13 +2576,17 @@ void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const fl } void cudaF_tanh (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { - _tanh<<>>(y, x, d, src_stride); + _tanh<<>>(y, x, d, src_stride); } void cudaF_diff_tanh (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d, int e_stride, int y_stride) { _diff_tanh<<>>(eout, e, y, d, e_stride, y_stride); } +void cudaF_heaviside (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { + _heaviside<<>>(y, x, d, src_stride); +} + void cudaF_softmax_reduce (size_t Gr, size_t Bl, float* y, const float* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); } @@ -2493,7 +2596,7 @@ void cudaF_log_softmax_reduce (size_t Gr, size_t Bl, float* y, const float* x, M } void cudaF_splice(dim3 Gr, dim3 Bl, float* y, const float* x, const int32_cuda* off, MatrixDim d_out, MatrixDim d_in) { - _splice<<>>(y,x,off,d_out,d_in); + _splice<<>>(y,x,off,d_out,d_in); } void cudaF_one(int Gr, int Bl, float* x, int dim) { @@ -2517,20 +2620,20 @@ void cudaF_copy_from_sp(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim di } void cudaF_copy(dim3 Gr, dim3 Bl, float* y, const float* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { - _copy<<>>(y,x,copy_from,d_out,d_in); + _copy<<>>(y,x,copy_from,d_out,d_in); } - -void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { - _randomize<<>>(y,x,copy_from,d_out,d_in); + +void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { + _randomize<<>>(y,x,copy_from,d_out,d_in); } void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float* wei, float* grad, float l1, float lr, MatrixDim d, int stride_grad) { - _regularize_l1<<>>(wei,grad,l1,lr,d,stride_grad); + _regularize_l1<<>>(wei,grad,l1,lr,d,stride_grad); } -void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float* mat, float* vec_val, int32_cuda* vec_id, int32_cuda voff, MatrixDim d) { - _find_row_max_id<<>>(mat, vec_val, vec_id, voff, d); +void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float* mat, float* vec_val, int32_cuda* vec_id, MatrixDim d) { + _find_row_max_id<<>>(mat, vec_val, vec_id, d); } void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda* vec_tgt, float* mat_net_out, float* vec_log_post, MatrixDim d) { @@ -2541,10 +2644,6 @@ void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, _copy_rows_from_vec<<>>(mat_out, d_out, v_in); } -void cudaF_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { - _copy_col_from_mat<<>>(v,col,mat,dmat,dim); -} - void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { _copy_col_from_mat_df<<>>(v,col,mat,dmat,dim); } @@ -2578,7 +2677,7 @@ void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, } /* - * "double" + * "double" */ /* @@ -2607,11 +2706,6 @@ void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim _copy_from_tp<<>>(A,B,dmat); } - -void 
cudaD_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d) { - _copy_col_from_vec<<>>(mat,v,col,d); -} - void cudaD_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { _transpose_matrix<<>>(mat, d); } @@ -2685,7 +2779,7 @@ void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { } void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) { - _set_const<<>>(mat,value,d); + _set_const<<>>(mat,value,d); } void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { @@ -2693,7 +2787,7 @@ void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { } void cudaD_add(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) { - _add<<>>(mat,value,d); + _add<<>>(mat,value,d); } void cudaD_scale_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { @@ -2701,46 +2795,46 @@ void cudaD_scale_diag_packed(int Gr, int Bl, double* mat, double value, int dim) } void cudaD_scale(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) { - _scale<<>>(mat,value,d); + _scale<<>>(mat,value,d); } void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { - _apply_log<<>>(mat,d); + _apply_log<<>>(mat,d); } void cudaD_mul_elements(dim3 Gr, dim3 Bl, double* mat, const double* A, MatrixDim dst_d, int src_stride) { - _mul_elements<<>>(mat,A,dst_d,src_stride); + _mul_elements<<>>(mat,A,dst_d,src_stride); } void cudaD_div_elements(dim3 Gr, dim3 Bl, double* mat, const double* A, MatrixDim dst_d, int src_stride) { - _div_elements<<>>(mat,A,dst_d,src_stride); + _div_elements<<>>(mat,A,dst_d,src_stride); } void cudaD_max(dim3 Gr, dim3 Bl, double* mat, const double* A, MatrixDim dst_d, int src_stride) { - _max<<>>(mat,A,dst_d,src_stride); + _max<<>>(mat,A,dst_d,src_stride); } void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) { - _mul_cols_vec<<>>(mat,scale,d); + _mul_cols_vec<<>>(mat,scale,d); } void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) { _mul_rows_vec<<>>(mat,scale,d); } -void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double* y, const double* x, +void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride, int group_size) { _mul_rows_group_mat<<>>(y, x, d, src_stride, group_size); } -void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, - const double* x2, MatrixDim d, int src_stride, +void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, + const double* x2, MatrixDim d, int src_stride, int group_size, double power) { _calc_pnorm_deriv<<>>(y, x1, x2, d, src_stride, group_size, power); } -void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, - const double* x2, MatrixDim d, int src_stride, +void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, + const double* x2, MatrixDim d, int src_stride, int group_size) { _calc_group_max_deriv<<>>(y, x1, x2, d, src_stride, group_size); } @@ -2753,7 +2847,15 @@ void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double* src, double* ds if (A_trans) { _add_mat_trans<<>>(alpha,src,dst,d,src_stride); } else { - _add_mat<<>>(alpha,src,dst,d,src_stride); + _add_mat<<>>(alpha,src,dst,d,src_stride); + } +} + +void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double* dst, MatrixDim d, int src_stride, int A_trans) { + if (A_trans) { + 
_add_mat_blocks_trans<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride); + } else { + _add_mat_blocks<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride); } } @@ -2767,11 +2869,11 @@ void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* } void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double* col, double beta, double* dst, MatrixDim d) { - _add_vec_to_cols<<>>(alpha,col,beta,dst,d); + _add_vec_to_cols<<>>(alpha,col,beta,dst,d); } void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d) { - _add_vec_to_rows<<>>(alpha,row,beta,dst,d); + _add_vec_to_rows<<>>(alpha,row,beta,dst,d); } void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, const double *vec, double beta) { @@ -2784,7 +2886,7 @@ void cudaD_add_mat_mat_elements(dim3 Gr, dim3 Bl, double *data, const double *sr // CURRENTLY UNUSED... void cudaD_apply_mask(dim3 Gr, dim3 Bl, double* mat, const char* mask, MatrixDim dmat, MatrixDim dmask) { - _apply_mask<<>>(mat,mask,dmat,dmask); + _apply_mask<<>>(mat,mask,dmat,dmask); } @@ -2820,16 +2922,16 @@ void cudaD_vec_max(const double* v, double* value, int dim) { _vec_max<<<1,CU1DBLOCK>>>(v, value, dim); } -void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat_trans <<<4,CU1DBLOCK>>>(A,B,dA,B_stride,value); +void cudaD_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { + _trace_mat_mat_trans<<>>(A,B,dA,B_stride,value); } -void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value); +void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { + _trace_mat_mat<32><<>>(A,B,dA,B_stride,value); } -void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, +void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, int N_col_stride, int threads_per_element, double beta) { _add_diag_mat_mat<<>>(alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); @@ -2839,10 +2941,6 @@ void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, _add_vec_vec<<>>(alpha,v,x,y,beta,dim); } -void cudaD_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { - _copy_col_from_mat<<>>(v,col,mat,dmat,dim); -} - void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { _copy_col_from_mat_df<<>>(v,col,mat,dmat,dim); } @@ -2859,8 +2957,12 @@ void cudaD_pvec_sum(int Gr, int Bl, double* v, double* pvec_sum, int dim, int si _pvec_sum<<>>(v,pvec_sum,dim,size); } -void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int s) { - _cuda_matrix_add_elements<<>>(data, dim, alpha, x, s); +void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int num_elements) { + 
_cuda_matrix_add_elements<<>>(data, dim, alpha, x, num_elements); +} + +void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, double alpha, const Int32Pair* indices, const double* x, int s, double* data) { + _cuda_matrix_add_indexed_values<<>>(dim, alpha, indices, x, s, data); } void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim) { @@ -2871,6 +2973,10 @@ void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float *c _vec_apply_floor<<>>(v,floor_val,count,dim); } +void cudaD_vec_apply_ceiling(int Gr, int Bl, double* v, double ceiling_val, float *count, int dim) { + _vec_apply_ceiling<<>>(v,ceiling_val,count,dim); +} + void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim) { _vec_apply_exp<<>>(v,dim); } @@ -2911,21 +3017,21 @@ void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int * cu:: */ void cudaD_soft_hinge (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { - _soft_hinge<<>>(y, x, d, src_stride); + _soft_hinge<<>>(y, x, d, src_stride); } -void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, +void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride, int group_size, double power) { _group_pnorm<<>>(y, x, d, src_stride, group_size, power); } -void cudaD_group_max(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, +void cudaD_group_max(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride, int group_size) { _group_max<<>>(y, x, d, src_stride, group_size); } void cudaD_sigmoid (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { - _sigmoid<<>>(y, x, d, src_stride); + _sigmoid<<>>(y, x, d, src_stride); } void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d, int e_stride, int y_stride) { @@ -2933,13 +3039,17 @@ void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const } void cudaD_tanh (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { - _tanh<<>>(y, x, d, src_stride); + _tanh<<>>(y, x, d, src_stride); } void cudaD_diff_tanh (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d, int e_stride, int y_stride) { _diff_tanh<<>>(eout, e, y, d, e_stride, y_stride); } +void cudaD_heaviside (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { + _heaviside<<>>(y, x, d, src_stride); +} + void cudaD_softmax_reduce (size_t Gr, size_t Bl, double* y, const double* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); } @@ -2949,7 +3059,7 @@ void cudaD_log_softmax_reduce (size_t Gr, size_t Bl, double* y, const double* x, } void cudaD_splice(dim3 Gr, dim3 Bl, double* y, const double* x, const int32_cuda* off, MatrixDim d_out, MatrixDim d_in) { - _splice<<>>(y,x,off,d_out,d_in); + _splice<<>>(y,x,off,d_out,d_in); } void cudaD_one(int Gr, int Bl, double* x, int dim) { @@ -2973,19 +3083,19 @@ void cudaD_copy_from_sp(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim } void cudaD_copy(dim3 Gr, dim3 Bl, double* y, const double* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { - _copy<<>>(y,x,copy_from,d_out,d_in); + _copy<<>>(y,x,copy_from,d_out,d_in); } - -void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { - _randomize<<>>(y,x,copy_from,d_out,d_in); + +void cudaD_randomize(dim3 Gr, dim3 Bl, 
double* y, const double* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { + _randomize<<>>(y,x,copy_from,d_out,d_in); } void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double* wei, double* grad, double l1, double lr, MatrixDim d,int stride_grad) { - _regularize_l1<<>>(wei,grad,l1,lr,d,stride_grad); + _regularize_l1<<>>(wei,grad,l1,lr,d,stride_grad); } -void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double* mat, double* vec_val, int32_cuda* vec_id, int32_cuda voff, MatrixDim d) { - _find_row_max_id<<>>(mat, vec_val, vec_id, voff, d); +void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double* mat, double* vec_val, int32_cuda* vec_id, MatrixDim d) { + _find_row_max_id<<>>(mat, vec_val, vec_id, d); } void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda* vec_tgt, double* mat_net_out, double* vec_log_post, MatrixDim d) { @@ -3041,19 +3151,19 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_ } void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<32><<>>(mat_out,mat_in,d_out,d_in); } -void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); +void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { + _copy_from_mat_trans<32><<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<32><<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<32><<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { @@ -3081,19 +3191,6 @@ void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat_out, const Matri _copy_from_smat_trans<<>>(mat_out, smat_in, d_out, d_in); } -void cuda_copy_from_smat_as_vec_ff(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - _copy_from_smat_as_vec<<>>(vec_out, smat_in, d_in); -} -void cuda_copy_from_smat_as_vec_fd(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - _copy_from_smat_as_vec<<>>(vec_out, smat_in, d_in); -} -void cuda_copy_from_smat_as_vec_df(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - _copy_from_smat_as_vec<<>>(vec_out, smat_in, d_in); -} -void cuda_copy_from_smat_as_vec_dd(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - _copy_from_smat_as_vec<<>>(vec_out, smat_in, d_in); -} - void cudaF_trace_mat_smat(dim3 Gr, dim3 Bl, const float* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, float* trace_vec_out) { _trace_mat_smat<<>>(mat_in, smat_in, mat_d_in, smat_d_in, trace_vec_out); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 9464f9e261a..342f2705e74 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -4,7 +4,7 @@ // 2013 Ehsan Variani // 2014 Johns 
Hopkins University (author: Daniel Povey) // 2013 Hainan Xu -// 2013 Xiaohui Zhang +// 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // See ../../COPYING for clarification regarding multiple authors @@ -33,14 +33,14 @@ #include "cudamatrix/cu-kernels-ansi.h" /* - * In this file are C++ templated wrappers + * In this file are C++ templated wrappers * of the ANSI-C CUDA kernels */ namespace kaldi { /* - * CuMatrix + * CuMatrix */ inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) { cudaF_copy_upp_low(Gr, Bl, A, dimA); } @@ -108,19 +108,6 @@ inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, double* mat_out, const M cuda_copy_from_smat_dd_trans(Gr, Bl, mat_out, smat_in, d_out, d_in); } -inline void cuda_copy_from_smat_as_vec(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - cuda_copy_from_smat_as_vec_ff(Gr, Bl, vec_out, smat_in, d_in); -} -inline void cuda_copy_from_smat_as_vec(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - cuda_copy_from_smat_as_vec_fd(Gr, Bl, vec_out, smat_in, d_in); -} -inline void cuda_copy_from_smat_as_vec(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - cuda_copy_from_smat_as_vec_df(Gr, Bl, vec_out, smat_in, d_in); -} -inline void cuda_copy_from_smat_as_vec(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - cuda_copy_from_smat_as_vec_dd(Gr, Bl, vec_out, smat_in, d_in); -} - inline void cuda_trace_mat_smat(dim3 Gr, dim3 Bl, const float* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, float* trace_vec_out) { cudaF_trace_mat_smat(Gr, Bl, mat_in, smat_in, mat_d_in, smat_d_in, trace_vec_out); } @@ -134,7 +121,6 @@ inline void cuda_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, co cudaD_trace_mat_smat_trans(Gr, Bl, mat_in, smat_in, mat_d_in, smat_d_in, trace_vec_out); } -inline void cuda_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d) { cudaF_copy_col_from_vec(Gr,Bl,mat,v,col,d); } inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr,Bl,mat,d); } inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); } inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim dim) { cudaF_apply_pow_abs(Gr,Bl,mat,power,include_sign, dim); } @@ -183,16 +169,17 @@ inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) {cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); } inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size) {cudaF_calc_group_max_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size); } inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); } +inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, 
A_trans); } inline void cuda_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { cudaF_add_mat_mat_div_mat(Gr,Bl,A,B,C,dst,d,stride_a,stride_b,stride_c); } inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); } inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); } inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_transpose_matrix(Gr, Bl, mat, d); } inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim, float *S, MatrixDim sdim) { cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim, const float *mat2, int mat2_row_stride, int mat2_col_stride, const float *vec, float beta) { cudaF_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride, mat2_col_stride, vec, beta); } -inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA_data, const float *srcB_data, MatrixDim dim, int srcA_stride, int srcB_stride, float alpha, float beta) { cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim, srcA_stride, srcB_stride, alpha, beta); } +inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA_data, const float *srcB_data, MatrixDim dim, int srcA_stride, int srcB_stride, float alpha, float beta) { cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim, srcA_stride, srcB_stride, alpha, beta); } + - /* * CuVector */ @@ -205,22 +192,22 @@ inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { cudaF_vec_soft_max(Gr,Bl,v,dim); } inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(v,value,dim); } inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); } -inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); } -inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); } -inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, +inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(Gr,Bl,A,B,dA,B_stride,value); } +inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(Gr,Bl,A,B,dA,B_stride,value); } +inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, int N_col_stride, int threads_per_element, float beta) { cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); } inline void cuda_add_vec_vec(int Gr, int Bl, float 
alpha, float* v, const float* x, const float* y, float beta, int dim) { cudaF_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); } -inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc) { cudaF_vec_sum(Gr,Bl,v,value,dim,inc); } inline void cuda_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size) { cudaF_pvec_sum(Gr, Bl, vec, pvec_sum, dim, size); } inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim) { cudaF_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); } inline void cuda_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim) { cudaF_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); } +inline void cuda_vec_apply_ceiling(int Gr, int Bl, float* v, float floor_val, float* num, int dim) { cudaF_vec_apply_ceiling(Gr,Bl,v,floor_val,num,dim); } inline void cuda_vec_apply_exp(int Gr, int Bl, float* v, int dim) { cudaF_vec_apply_exp(Gr,Bl,v,dim); } inline void cuda_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim) { cudaF_vec_apply_log(Gr,Bl,v,flag,dim); } inline void cuda_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); } @@ -253,6 +240,7 @@ inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d,e_stride,y_stride); } +inline void cuda_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_heaviside(Gr,Bl,y,x,d,src_stride); } /* Bl: dimBlock value is fixed min(d.col, CU1DBLOCK), represent CU1DBLOCK threads reduce a row at the same time. 
Gr: the number of rows @@ -261,7 +249,7 @@ inline void cuda_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_log_softmax_reduce(Gr,Bl,y,x,d,src_stride); } inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d, int stride_grad) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d,stride_grad); } -inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); } +inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,d); } inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); } inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in) { cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in); @@ -277,7 +265,8 @@ inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl, const float* x, float* y, Matrix inline void cuda_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_lower(Gr,Bl,x,y,d_in); } inline void cuda_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_upper(Gr,Bl,x,y,d_in); } inline void cuda_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_mean(Gr,Bl,x,y,d_in); } -inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int s) { cudaF_matrix_add_elements(Gr, Bl, data, dim, alpha, x, s); } +inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int num_elements) { cudaF_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements); } +inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, float alpha, const Int32Pair* indices, const float* x, int s, float* data) { cudaF_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data); } inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int32 size, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) {cudaF_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); } inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, @@ -295,7 +284,7 @@ inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output); } -inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, +inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride) { cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride); } @@ -305,7 +294,7 @@ inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const f // double versions /* - * CuMatrix + * CuMatrix */ inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_upp_low(Gr, Bl, A, dimA); } inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_low_upp(Gr, Bl, A, dimA); } @@ -319,7 +308,6 @@ inline void 
cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const double* B inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp_trans(Gr,Bl,A,B,dmat); } inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat) { cudaD_copy_from_tp(Gr,Bl,A,B,dmat); } inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp(Gr,Bl,A,B,dmat); } -inline void cuda_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d) { cudaD_copy_col_from_vec(Gr,Bl,mat,v,col,d); } inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_apply_exp(Gr,Bl,mat,d); } inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim dim) { cudaD_apply_pow(Gr,Bl,mat,power,dim); } inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim dim) { cudaD_apply_pow_abs(Gr,Bl,mat,power,include_sign,dim); } @@ -368,6 +356,7 @@ inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power) {cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); } inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size) {cudaD_calc_group_max_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size); } inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); } +inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } inline void cuda_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { cudaD_add_mat_mat_div_mat(Gr,Bl,A,B,C,dst,d,stride_a,stride_b,stride_c); } inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); } inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); } @@ -388,22 +377,22 @@ inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, in inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr,Bl,v,dim); } inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_min(v,value,dim); } inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); } -inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); } -inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); } -inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int 
M_row_stride, int M_col_stride, const double *N, int N_row_stride, +inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(Gr,Bl,A,B,dA,B_stride,value); } +inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(Gr,Bl,A,B,dA,B_stride,value); } +inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, int N_col_stride, int threads_per_element, double beta) { cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); } inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); } -inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc) { cudaD_vec_sum(Gr,Bl,v,value,dim,inc); } inline void cuda_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size) { cudaD_pvec_sum(Gr,Bl,vec,pvec_sum,dim,size); } inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim) { cudaD_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); } inline void cuda_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim) { cudaD_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); } +inline void cuda_vec_apply_ceiling(int Gr, int Bl, double* v, double floor_val, float* num, int dim) { cudaD_vec_apply_ceiling(Gr,Bl,v,floor_val,num,dim); } inline void cuda_vec_apply_exp(int Gr, int Bl, double* v, int dim) { cudaD_vec_apply_exp(Gr,Bl,v,dim); } inline void cuda_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim) { cudaD_vec_apply_log(Gr,Bl,v,flag,dim); } inline void cuda_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); } @@ -434,11 +423,12 @@ inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d,e_stride,y_stride); } +inline void cuda_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_heaviside(Gr,Bl,y,x,d,src_stride); } inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_softmax_reduce(Gr,Bl,y,x,d,src_stride); } 
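// A minimal, standalone sketch of the pattern used throughout this header
// (hypothetical names; the real wrappers forward to the cudaF_*/cudaD_* ANSI-C
// kernels rather than doing the work inline): each operation gets a float and a
// double overload under one name, so templated CuMatrix/CuVector code can call
// that single name and let overload resolution pick the right precision.
inline void cuda_example_scale(float *data, int n, float alpha) {
  for (int i = 0; i < n; i++) data[i] *= alpha;  // real version: launch cudaF_example_scale(...)
}
inline void cuda_example_scale(double *data, int n, double alpha) {
  for (int i = 0; i < n; i++) data[i] *= alpha;  // real version: launch cudaD_example_scale(...)
}
template<typename Real>
void ExampleScaleAll(Real *data, int n, Real alpha) {
  cuda_example_scale(data, n, alpha);  // resolves to the float or the double overload
}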
inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_log_softmax_reduce(Gr,Bl,y,x,d,src_stride); } inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d, int stride_grad) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d,stride_grad); } -inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); } +inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,d); } inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) { cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); } @@ -454,7 +444,8 @@ inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl, const double* x, double* y, Matr inline void cuda_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_lower(Gr,Bl,x,y,d_in); } inline void cuda_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_upper(Gr,Bl,x,y,d_in); } inline void cuda_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_mean(Gr,Bl,x,y,d_in); } -inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int s) { cudaD_matrix_add_elements(Gr, Bl, data, dim, alpha, x, s); } +inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int num_elements) { cudaD_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements); } +inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, double alpha, const Int32Pair* indices, const double* x, int s, double* data) { cudaD_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data); } inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int32 size, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t) {cudaD_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); } inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, const double *src_data, MatrixDim src_dim, const Int32Pair *indices) { @@ -471,23 +462,23 @@ inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output); } -inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const double *mat2, double *mask, +inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const double *mat2, double *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride) { cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride); } // Also include some template-friendly wrappers of cublas functions: -inline void cuda_axpy(int n, float alpha, const float *x, int incx, float *y, int incy) { - cublasSaxpy(n, alpha, x, incx, y, incy); +inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, float alpha, const float *x, int incx, float *y, int incy) { + return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy); } -inline void cuda_axpy(int n, double alpha, const double *x, int incx, double *y, int incy) { - cublasDaxpy(n, alpha, x, incx, y, incy); +inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, double alpha, const double *x, int incx, double 
*y, int incy) { + return cublasDaxpy_v2(handle, n, &alpha, x, incx, y, incy); } -inline void cuda_scal(int n, float alpha, float *x, int incx) { - cublasSscal(n, alpha, x, incx); +inline cublasStatus_t cuda_scal(cublasHandle_t handle, int n, float alpha, float *x, int incx) { + return cublasSscal_v2(handle, n, &alpha, x, incx); } -inline void cuda_scal(int n, double alpha, double *x, int incx) { - cublasDscal(n, alpha, x, incx); +inline cublasStatus_t cuda_scal(cublasHandle_t handle, int n, double alpha, double *x, int incx) { + return cublasDscal_v2(handle, n, &alpha, x, incx); } diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 2dae3bcb7b5..c36cb88f6f6 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cuda-math-test.cc +// cudamatrix/cu-math-test.cc // Copyright 2013 Johns Hopkins University (Author: David Snyder) diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 453cf4439fb..65a4c0c4af3 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -1,7 +1,7 @@ // cudamatrix/cu-math.h // Copyright 2009-2012 Karel Vesely -// 2013 Johns Hopkins University (Author: David Snyder) +// 2013 Johns Hopkins University (Author: David Snyder) // See ../../COPYING for clarification regarding multiple authors // @@ -28,9 +28,9 @@ #include "base/timer.h" namespace kaldi { - + namespace cu { - + /// RegularizeL1 is a gradient step with l1 regularization added to the /// gradient. We don't let the value cross over zero from positive to negative /// or vice versa, in a single step. If an element tries to cross zero and is @@ -40,9 +40,9 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *gradient, Real l1_penalty, Real learning_rate); /// Copies a permutation of src into tgt. The row permutation is specified in -/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The +/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The /// dimensions of copy_from_idx must be equivalent to the number of rows in -/// tgt and src and all elements in the vector must be in [0, src.numRows()-1]. +/// tgt and src and all elements in the vector must be in [0, src.numRows()-1]. template void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, @@ -52,10 +52,10 @@ void Randomize(const CuMatrixBase &src, /// The dimensions of tgt must be equivalent to the number of rows in src /// and it must be that tgt.NumColumns == src.NumColumns * frame_offsets.Dim(). /// As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the -/// general case where i in [0..src.NumRows()-1], -/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumRows()-1] +/// general case where i in [0..src.NumRows()-1], +/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumRows()-1] /// and n_cols = src.NumColumns(). If i + frame_offsets[k] is greater than the -/// number of rows in src or less than 0 than the right side of the equation +/// number of rows in src or less than 0 than the right side of the equation /// is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid /// an index out of bounds. 
template @@ -73,6 +73,13 @@ void Copy(const CuMatrixBase &src, const CuArray ©_from_indices, CuMatrixBase *tgt); +template +void Group2norm(const CuMatrixBase &src, + CuMatrixBase *dest, + int32 group_stride); + + + } // namespace cu } // namespace kaldi diff --git a/src/cudamatrix/cu-matrix-inl.h b/src/cudamatrix/cu-matrix-inl.h index a37e38bcd17..9b7a707d2e5 100644 --- a/src/cudamatrix/cu-matrix-inl.h +++ b/src/cudamatrix/cu-matrix-inl.h @@ -38,15 +38,35 @@ inline CuSubMatrix::CuSubMatrix(const CuMatrixBase &mat, KALDI_ASSERT(row_offset >= 0 && col_offset >= 0 && row_offset + num_rows <= mat.num_rows_ && col_offset + num_cols <= mat.num_cols_); - this->data_ = mat.data_ + (row_offset * mat.stride_) + col_offset; + this->data_ = mat.data_ + static_cast(col_offset) + + static_cast(row_offset) * static_cast(mat.stride_); this->num_cols_ = num_cols; this->num_rows_ = num_rows; this->stride_ = mat.stride_; } } - + +template +inline CuSubMatrix::CuSubMatrix(const Real *data, + const MatrixIndexT num_rows, + const MatrixIndexT num_cols, + const MatrixIndexT stride): + CuMatrixBase(const_cast(data), num_rows, num_cols, stride) { + // in general if you use SubMatrix or CuSubMatrix, const-correctness is not + // preserved (preserving it would require us duplicating the class and it + // would have been a hassle). + + // Note: we used to check that stride >= num_cols. We no longer check for + // this as there are some situations where having stride < num_cols is useful, + // but beware because most if not all CUBLAS calls will crash when given + // such an input, even in a situation where it makes sense. + KALDI_ASSERT((num_rows != 0) == (num_cols != 0) && stride >= 0 && + num_rows >= 0 && num_cols >= 0 && stride >= 0); +} + + } // namespace kaldi #endif - + diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index 1ef970b9272..1052733b045 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -40,7 +40,102 @@ template std::string NameOf() { return (sizeof(Real) == 8 ? 
"" : ""); } - + +template void TestCuMatrixTransposeNS(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim / 2); + M.SetRandn(); + + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + M.Transpose(); + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter / 2) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG<< "For CuMatrix::TransposeNS" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + +template void TestCuMatrixTransposeS(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim); + M.SetRandn(); + + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + M.Transpose(); + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG<< "For CuMatrix::TransposeS" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + +template void TestCuMatrixTransposeCross(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix Mf(dim / 2, dim), ref(dim, dim / 2); + CuMatrix Md(dim, dim / 2); + Mf.SetRandn(); + ref = Mf; + + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + Md.CopyFromMat(Mf, kTrans); + Mf.CopyFromMat(Md, kTrans); + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG<< "For CuMatrix::TransposeCross" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; + + AssertEqual(ref, Mf); +} + +template void TestCuMatrixAddMat(int32 dim, + int32 num_row_blocks, int32 num_col_blocks) { + BaseFloat time_in_secs = 0.025; + CuMatrix A(dim, dim), B(dim * num_row_blocks, dim * num_col_blocks); + A.SetRandn(); + B.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + for (int32 i = 0; i < num_row_blocks; i++) { + for (int32 j = 0; j < num_col_blocks; j++) { + A.AddMat(0.0, CuSubMatrix(B, i * dim, dim, j * dim, dim)); + } + } + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * num_row_blocks * num_col_blocks * iter) + / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::AddMat" << NameOf() << ", for dim = " + << dim << "numRowBlocks = "<< num_row_blocks << "numColBlocks = " + << num_col_blocks << ", speed was " << gflops << " gigaflops."; +} + +template void TestCuMatrixAddMatBlocks(int32 dim, + int32 num_row_blocks, int32 num_col_blocks) { + BaseFloat time_in_secs = 0.025; + CuMatrix A(dim, dim), B(dim * num_row_blocks, dim * num_col_blocks); + A.SetRandn(); + B.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + A.AddMatBlocks(0.0, B); + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * num_row_blocks * num_col_blocks * iter) + / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::AddMatBlocks" << NameOf() << ", for dim = " + << dim << ", numRowBlocks = "<< num_row_blocks << ", numColBlocks = " + << num_col_blocks << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixMatMat(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim), O(dim, dim); @@ -58,6 +153,42 @@ template void TestCuMatrixMatMat(int32 dim) { << dim << ", speed was " << gflops << " gigaflops."; } +template void TestCuMatrixMatMatBatched(int32 dim, int32 batchCount) { + std::vector* > a(batchCount), b(batchCount), c(batchCount); + std::vector* > A, B, C; + + for (int32 i = 0; i < batchCount; i++) { + // first create a Matrix intance and then creat a SubMatrix 
instance from that + a[i] = new CuMatrix(dim, dim); + b[i] = new CuMatrix(dim, dim); + c[i] = new CuMatrix(dim, dim); + a[i]->SetRandn(); + b[i]->SetRandn(); + A.push_back(new CuSubMatrix(*(a[i]), 0, a[i]->NumRows(), 0, + a[i]->NumCols())); + B.push_back(new CuSubMatrix(*(b[i]), 0, b[i]->NumRows(), 0, + b[i]->NumCols())); + C.push_back(new CuSubMatrix(*(c[i]), 0, c[i]->NumRows(), 0, + c[i]->NumCols())); + } + BaseFloat time_in_secs = 0.025; + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + AddMatMatBatched(static_cast(1.0), C, A, kNoTrans, B, kNoTrans, + static_cast(0.0)); + } + for (int32 i = 0; i< batchCount; i++) { + delete a[i]; delete b[i]; delete c[i]; + delete A[i]; delete B[i]; delete C[i]; + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * fdim * iter * batchCount) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::AddMatMatBatched" << NameOf() << ", for dim = " << dim + << ", batchSize = " << batchCount << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixAddDiagVecMat(int32 dim, MatrixTransposeType trans) { BaseFloat time_in_secs = 0.015; CuMatrix M(dim, dim), N(dim, dim); @@ -85,14 +216,14 @@ template void TestSymInvertPosDef(int32 dim) { M.SetRandn(); N.SymAddMat2(1.0, M, kNoTrans, 0.0); CuMatrix Ncopy(N); - + int iter = 0; Timer tim; for (;tim.Elapsed() < time_in_secs; iter++) { Ncopy.CopyFromMat(N); Ncopy.SymInvertPosDef(); } - + BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::TestCuInvertPosDef" << NameOf() << ", for dim = " @@ -100,9 +231,9 @@ template void TestSymInvertPosDef(int32 dim) { } -template +template static void TestCuMatrixCompObjfAndDeriv(int32 dim) { - BaseFloat time_in_secs = 0.025; + BaseFloat time_in_secs = 0.025; // Previously tested for larger dims, but test was slow. int32 n_r = dim, n_c = dim + Rand() % 5; @@ -111,7 +242,7 @@ static void TestCuMatrixCompObjfAndDeriv(int32 dim) { B.SetRandn(); B.Add(1.0); B.ApplyFloor(1.0e-10); - + std::vector > labels; for(int i = 0; i < n_r; i++) { for(int j = 0; j < n_c; j++) { @@ -135,7 +266,7 @@ static void TestCuMatrixCompObjfAndDeriv(int32 dim) { KALDI_LOG << "For CuMatrix::CompObjfAndDeriv" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; - + // do it one more time for correctness test. C.SetZero(); C.CompObjfAndDeriv(labels, B, &a, &b); @@ -144,30 +275,30 @@ static void TestCuMatrixCompObjfAndDeriv(int32 dim) { // repeat the real test. 
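// What the correctness check that follows verifies (inferred from the asserts
// below, not an independent specification): with weight matrix A and
// probability matrix B, the returned objective `a` should equal
//   sum_{i,j} A(i,j) * log B(i,j)   -- checked via TraceMatMat(log(B), A, kTrans),
// and the derivative matrix C should satisfy
//   C(i,j) = A(i,j) / B(i,j)        -- checked by comparing A.MulElements(1/B) with C.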
Real sum2; // sum(i, j) A(i, j) log(B(i, j)); - { + { CuMatrix Bcopy(B); Bcopy.ApplyLog(); sum2 = TraceMatMat(Bcopy, A, kTrans); } - + KALDI_ASSERT(ApproxEqual(a, sum2)); B.InvertElements(); A.MulElements(B); // each element of A is now A(i, j) / B(i, j); KALDI_ASSERT(ApproxEqual(A, C)); - + } -template +template static void TestCuFindRowMaxId(int32 dim) { int32 dimM = dim, dimN = dimM + Rand() % 5; Matrix Hi(dimM, dimN); Hi.SetRandn(); - + CuMatrix Di(dimM, dimN); Di.CopyFromMat(Hi); @@ -186,7 +317,7 @@ static void TestCuFindRowMaxId(int32 dim) { KALDI_LOG << "For CuMatrix::FindRowMaxId" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; - + // on cpu for(MatrixIndexT r=0; r void TestCuMatrixSigmoid(int32 dim) { << dim << ", speed was " << gflops << " gigaflops."; } +template void TestCuMatrixHeaviside(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim), N(dim, dim); + M.SetRandn(); + N.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + N.ApplyHeaviside(); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::Heaviside" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixMulRowsGroupMat(int32 dim) { BaseFloat time_in_secs = 0.025; @@ -298,7 +446,7 @@ template void TestCuMatrixGroupPnormDeriv(int32 dim) { int32 group_size = 4; CuMatrix M(dim, dim), N(dim, dim / group_size), O(dim, dim); M.SetRandn(); - N.GroupPnorm(M, 2.0); + N.GroupPnorm(M, 2.0); Timer tim; int32 iter = 0; @@ -348,8 +496,8 @@ template void TestCuMatrixGroupMaxDeriv(int32 dim) { template void TestCuMatrixTraceMatMat(int32 dim) { for (int32 n = 0; n < 2; n++) { MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans); - BaseFloat time_in_secs = 0.08; - + BaseFloat time_in_secs = 0.02; + CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); @@ -360,7 +508,7 @@ template void TestCuMatrixTraceMatMat(int32 dim) { } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); - KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf() + KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf() << (trans == kTrans ? " [transposed]" : "") << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } @@ -368,10 +516,10 @@ template void TestCuMatrixTraceMatMat(int32 dim) { template void TestCuMatrixCholesky(int32 dim) { - BaseFloat time_in_secs = 0.08; - + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim); - M.AddToDiag(100.0); + M.AddToDiag(100.0); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) @@ -379,7 +527,7 @@ template void TestCuMatrixCholesky(int32 dim) { BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); - KALDI_LOG << "For CuMatrix::Cholesky" << NameOf() + KALDI_LOG << "For CuMatrix::Cholesky" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } @@ -418,7 +566,7 @@ template void TestCuMatrixCopyFromTp(int32 dim, MatrixTransposeTy Matrix M_cpu(T_cpu, trans); Matrix M2_cpu(M); AssertEqual(M_cpu, M2_cpu); - + BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyFromTp" << (trans == kNoTrans ? 
"[NoTrans]":"[Trans]") @@ -442,7 +590,7 @@ template void TestCuMatrixCopyFromSp(int32 dim) { Matrix M_cpu(S_cpu); Matrix M2_cpu(M); AssertEqual(M_cpu, M2_cpu); - + BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyFromSp" << NameOf() << ", for dim = " @@ -469,6 +617,19 @@ template void TestCuMatrixCopyUpperToLower(int32 dim) { } +template void TestCuMatrixResize(int32 dim) { + BaseFloat time_in_secs = 0.025; + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + CuMatrixM(dim, dim, kUndefined); // we are testing the allocation and deallocation time. + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::TestCuMatrixResize" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixSetZeroAboveDiag(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); @@ -482,9 +643,10 @@ template void TestCuMatrixSetZeroAboveDiag(int32 dim) { KALDI_LOG << "For CuMatrix::SetZeroAboveDiag" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } -template + +template void TestCuMatrixLookup(int32 dim) { - BaseFloat time_in_secs = 0.025; + BaseFloat time_in_secs = 0.025; int32 dimM = dim, dimN = dim; CuMatrix H(dimM, dimN); H.SetRandn(); @@ -493,6 +655,7 @@ void TestCuMatrixLookup(int32 dim) { std::vector output; // Generates the indices and the reference. int32 num_index = dim * dim; + output.resize(num_index); for (int32 j = 0; j < num_index; j++) { MatrixIndexT r = Rand() % dimM; MatrixIndexT c = Rand() % dimN; @@ -506,12 +669,12 @@ void TestCuMatrixLookup(int32 dim) { Timer tim; int32 iter = 0; for (; tim.Elapsed()< time_in_secs; iter++) - H.Lookup(indices, &output); + H.Lookup(indices, &(output[0])); - BaseFloat fdim = dim; + BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); - KALDI_LOG << "For CuMatrix::Lookup" << NameOf() << ", for dim = " - << dim << ", speed was " << gflops << " gigaflops."; + KALDI_LOG << "For CuMatrix::Lookup" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyRows1(int32 dim) { @@ -693,8 +856,16 @@ template void CudaMatrixSpeedTest() { sizes.push_back(512); sizes.push_back(1024); int32 ns = sizes.size(); + for (int32 s = 0; s < ns; s++) + TestCuMatrixResize(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixAddMat(sizes[s], 3, 3); + for (int32 s = 0; s < ns; s++) + TestCuMatrixAddMatBlocks(sizes[s], 3, 3); for (int32 s = 0; s < ns; s++) TestCuMatrixMatMat(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixMatMatBatched(sizes[s], 10); for (int32 s = 0; s < ns; s++) { TestCuMatrixAddDiagVecMat(sizes[s], kNoTrans); TestCuMatrixAddDiagVecMat(sizes[s], kTrans); @@ -705,6 +876,8 @@ template void CudaMatrixSpeedTest() { TestCuMatrixCholesky(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSigmoid(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixHeaviside(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuFindRowMaxId(sizes[s]); for (int32 s = 0; s < ns; s++) @@ -737,22 +910,28 @@ template void CudaMatrixSpeedTest() { TestCuMatrixCopyUpperToLower(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSetZeroAboveDiag(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixLookup(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) 
TestCuMatrixCopyRows1(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixCopyRows2(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixCopyToRows(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixAddRows1(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixAddRows2(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixAddToRows(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixAddRowRanges(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixTransposeCross(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixTransposeS(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixTransposeNS(sizes[s]); } diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index e54047e7262..74419ea25ba 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -1,9 +1,9 @@ -// cudamatrix/cuda-matrix-test.cc +// cudamatrix/cu-matrix-test.cc // Copyright 2010 Karel Vesely // 2013 Lucas Ondel // 2013 Johns Hopkins University (author: Daniel Povey) -// 2013 Hainan Xu +// 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013 Johns Hopkins University (author: Guoguo Chen) @@ -39,7 +39,7 @@ namespace kaldi { /* * INITIALIZERS */ -template +template static void InitRand(VectorBase *v) { for (MatrixIndexT i = 0; i < v->Dim(); i++) (*v)(i) = RandGauss(); @@ -47,7 +47,7 @@ static void InitRand(VectorBase *v) { -template +template static void InitRand(MatrixBase *M) { do { for (MatrixIndexT i = 0;i < M->NumRows();i++) @@ -58,7 +58,7 @@ static void InitRand(MatrixBase *M) { -template +template static void RandZeroToOneMatrix(MatrixBase* mat) { for(int32 r=0; rNumRows(); r++) for(int32 c=0; cNumCols(); c++) @@ -70,7 +70,7 @@ static void RandZeroToOneMatrix(MatrixBase* mat) { * Unit tests */ -template +template static void UnitTestCuMatrixTraceMatMat() { for (int32 i = 0; i < 2; i++) { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; @@ -102,11 +102,11 @@ static void UnitTestCuMatrixTraceMatMat() { } -template +template static void UnitTestCuCholesky() { for (int32 i = 0; i < 2; i++) { int32 M = 1 + Rand() % 10, N = M + 5; - + CuMatrix A(M, N); A.SetRandn(); CuMatrix S(M, M); @@ -133,7 +133,7 @@ static void UnitTestCuCholesky() { /* * CuMatrix */ -template +template static void UnitTestCuMatrixApplyLog() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; Matrix H(M, N); @@ -154,7 +154,7 @@ static void UnitTestCuMatrixApplyLog() { /* * CuMatrix */ -template +template static void UnitTestCuMatrixApplyExp() { int32 M = 10 + Rand() % 20, N = 10 + Rand() % 20; Matrix H(M, N); @@ -173,7 +173,7 @@ static void UnitTestCuMatrixApplyExp() { -template +template static void UnitTestCuMatrixSigmoid() { for (int32 i = 0; i < 2; i++) { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; @@ -193,7 +193,7 @@ static void UnitTestCuMatrixSigmoid() { } } -template +template static void UnitTestCuMatrixScale() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; Matrix H(M, N); @@ -208,7 +208,7 @@ static void UnitTestCuMatrixScale() { AssertEqual(H, E); } -template +template static void UnitTestCuMatrixAdd() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; Matrix H(M, N); @@ -224,7 +224,7 @@ static void UnitTestCuMatrixAdd() { } -template +template static void UnitTestCuMatrixSoftHinge() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; 
Matrix H(M, N); @@ -236,13 +236,13 @@ static void UnitTestCuMatrixSoftHinge() { E.SoftHinge(D); H.SoftHinge(H); - + Matrix H2(E); AssertEqual(H,H2); } -template +template static void UnitTestCuMatrixGroupPnorm() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; // M = 256; N = 256; @@ -266,7 +266,7 @@ static void UnitTestCuMatrixGroupPnorm() { } } -template +template static void UnitTestCuMatrixGroupMax() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; // M = 256; N = 256; @@ -287,7 +287,7 @@ static void UnitTestCuMatrixGroupMax() { } } -template +template static void UnitTestCuMatrixSet() { for (int32 i = 0; i < 2; i++) { BaseFloat value= 0.333; @@ -302,20 +302,20 @@ static void UnitTestCuMatrixSet() { } -template +template static void UnitTestCuMatrixApplyPow() { for (int32 i = 0; i < 2; i++) { BaseFloat pow = 0.5 * (Rand() % 6); - + Matrix H(10 + Rand() % 60, 10 + Rand() % 20); H.SetRandn(); H.Row(0).Set(0.0); if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } - + if (pow != 1.0 && pow != 2.0 && pow != 3.0) H.MulElements(H); //make numbers positive - + CuMatrix cH(H); cH.ApplyPow(pow); @@ -326,17 +326,17 @@ static void UnitTestCuMatrixApplyPow() { } } -template +template static void UnitTestCuMatrixApplyPowAbs() { for (int32 i = 0; i < 2; i++) { BaseFloat pow = 0.5 * (Rand() % 6); - + Matrix H(10 + Rand() % 60, 10 + Rand() % 20); H.SetRandn(); H.Row(0).Set(0.0); if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } - + CuMatrix cH(H); cH.ApplyPowAbs(pow, true); @@ -383,7 +383,7 @@ static void UnitTestCuMatrixCopyRows() { num_cols = 10 + Rand() % 10; CuMatrix M(num_rows1, num_cols); M.SetRandn(); - + CuMatrix N1(num_rows2, num_cols), N2(num_rows2, num_cols), O(num_rows2, num_cols); std::vector reorder(num_rows2); @@ -404,7 +404,7 @@ static void UnitTestCuMatrixCopyRows() { for (int32 j = 0; j < num_cols; j++) if (reorder[i] < 0) O(i, j) = 0; else O(i, j) = M(reorder[i], j); - + AssertEqual(N1, O); AssertEqual(N2, O); } @@ -452,7 +452,7 @@ static void UnitTestCuMatrixAddRows() { num_rows2 = 10 + Rand() % 10, num_cols = 10 + Rand() % 10; CuMatrix M(num_rows1, num_cols); - M.SetRandn(); + M.SetRandn(); CuMatrix N1(num_rows2, num_cols), N2(num_rows2, num_cols), O(num_rows2, num_cols); @@ -592,7 +592,7 @@ static void UnitTestCuMatrixSumColumnRanges() { } CuMatrix cu_src(src); CuMatrix cu_dst(num_rows, num_cols2, kUndefined); - CuArray indices_tmp(indices); + CuArray indices_tmp(indices); cu_dst.SumColumnRanges(cu_src, indices_tmp); Matrix dst2(cu_dst); AssertEqual(dst, dst2); @@ -610,8 +610,8 @@ static void UnitTestCuMatrixAddRowRanges() { Matrix dst(num_rows2, num_cols); dst.SetRandn(); // Computes the indexes. - std::vector indexes(num_cols); - for (MatrixIndexT i = 0; i < num_cols; i++) { + std::vector indexes(num_rows2); + for (MatrixIndexT i = 0; i < num_rows2; i++) { indexes[i].first = Rand() % num_rows1; int32 headroom = num_rows1 - indexes[i].first, size = (Rand() % headroom) + 1; @@ -620,12 +620,11 @@ static void UnitTestCuMatrixAddRowRanges() { indexes[i].second <= num_rows1 && indexes[i].first >= 0); } - // Computes reference matrix. 
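// Reading of the fixed reference computation below (taken from the test code
// itself, not an external spec): AddRowRanges accumulates one row range of src
// per output row, i.e. for each i in [0, num_rows2) and each column j,
//   dst(i, j) += sum_{i2 = indexes[i].first}^{indexes[i].second - 1} src(i2, j),
// which is why `indexes` is sized num_rows2 and indexed by the row i, not by j.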
Matrix dst1(dst); for (MatrixIndexT i = 0; i < num_rows2; i++) { + int32 start = indexes[i].first, end = indexes[i].second; for (MatrixIndexT j = 0; j < num_cols; j++) { - int32 start = indexes[j].first, end = indexes[j].second; for (MatrixIndexT i2 = start; i2 < end; i2++) dst1(i, j) += src(i2, j); } @@ -640,7 +639,7 @@ static void UnitTestCuMatrixAddRowRanges() { } } - + template static void UnitTestCuMatrixCopyCols() { for (int32 p = 0; p < 2; p++) { @@ -649,7 +648,7 @@ static void UnitTestCuMatrixCopyCols() { num_rows = 10 + Rand() % 10; CuMatrix M(num_rows, num_cols1); M.SetRandn(); - + CuMatrix N(num_rows, num_cols2), O(num_rows, num_cols2); std::vector reorder(num_cols2); for (int32 i = 0; i < num_cols2; i++) @@ -657,7 +656,7 @@ static void UnitTestCuMatrixCopyCols() { CuArray reorder_gpu(reorder); N.CopyCols(M, reorder_gpu); - + for (int32 i = 0; i < num_rows; i++) for (int32 j = 0; j < num_cols2; j++) if (reorder[j] < 0) O(i, j) = 0; @@ -675,7 +674,7 @@ static void UnitTestCuMatrixAddCols() { num_rows = 10 + Rand() % 10; CuMatrix M(num_rows, num_cols1); M.SetRandn(); - + CuMatrix N(num_rows, num_cols2), O(num_rows, num_cols2); std::vector reorder(num_cols2); for (int32 i = 0; i < num_cols2; i++) @@ -683,7 +682,7 @@ static void UnitTestCuMatrixAddCols() { CuArray reorder_gpu(reorder); N.AddCols(M, reorder_gpu); - + for (int32 i = 0; i < num_rows; i++) for (int32 j = 0; j < num_cols2; j++) if (reorder[j] < 0) O(i, j) = 0; @@ -693,16 +692,16 @@ static void UnitTestCuMatrixAddCols() { } -template +template static void UnitTestCuMatrixApplyFloor() { for (int32 i = 0; i < 3; i++) { BaseFloat floor = 0.33 * (Rand() % 6); - + Matrix H(10 + Rand() % 600, 10 + Rand() % 20); H.SetRandn(); if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } - + CuMatrix cH(H); cH.ApplyFloor(floor); @@ -714,8 +713,28 @@ static void UnitTestCuMatrixApplyFloor() { } } +template +static void UnitTestCuMatrixApplyCeiling() { + + for (int32 i = 0; i < 3; i++) { + BaseFloat ceiling = 0.33 * (Rand() % 6); -template + Matrix H(10 + Rand() % 600, 10 + Rand() % 20); + H.SetRandn(); + if (i == 2) { Matrix tmp(H,kTrans); H = tmp; } + + CuMatrix cH(H); + + cH.ApplyCeiling(ceiling); + + H.ApplyCeiling(ceiling); + Matrix H2(cH); + + AssertEqual(H, H2); + } +} + +template static void UnitTestCuMatrixApplyHeaviside() { for (int32 i = 0; i < 1; i++) { @@ -735,11 +754,30 @@ static void UnitTestCuMatrixApplyHeaviside() { } -template +template +static void UnitTestCuMatrixHeaviside() { + + for (int32 i = 0; i < 1; i++) { + Matrix H(10 + Rand() % 60, 10 + Rand() % 20); + H.SetRandn(); + H.Row(0).Set(0.0); + if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } + + CuMatrix cH(H); + CuMatrix cH2(H.NumRows(), H.NumCols(), kUndefined); + cH2.Heaviside(cH); + H.ApplyHeaviside(); + Matrix H2(cH2); + AssertEqual(H, H2); + } +} + + +template static void UnitTestCuMatrixMulElements() { for (int32 i = 0; i < 2; i++) { MatrixIndexT dimM = 100 + Rand() % 256, dimN = 100 + Rand() % 256; - + Matrix Ha(dimM, dimN); Matrix Hb(dimM, dimN); Ha.SetRandn(); @@ -760,11 +798,11 @@ static void UnitTestCuMatrixMulElements() { } } -template +template static void UnitTestCuMatrixDivElements() { for (int32 i = 0; i < 2; i++) { MatrixIndexT dimM = 100 + Rand() % 256, dimN = 100 + Rand() % 256; - + Matrix Ha(dimM, dimN); Matrix Hb(dimM, dimN); Ha.SetRandn(); @@ -785,7 +823,7 @@ static void UnitTestCuMatrixDivElements() { } } -template +template static void UnitTestCuMatrixMax() { Matrix Ha(100,100); Matrix Hb(100,100); @@ -808,7 +846,7 @@ static void 
UnitTestCuMatrixMax() { -template +template static void UnitTestCuMatrixMulColsVec() { Matrix Hm(100,99); Vector Hv(99); @@ -831,7 +869,7 @@ static void UnitTestCuMatrixMulColsVec() { -template +template static void UnitTestCuMatrixMulRowsVec() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; @@ -845,10 +883,10 @@ static void UnitTestCuMatrixMulRowsVec() { CuVector Dv(dimM); Dm.CopyFromMat(Hm); Dv.CopyFromVec(Hv); - + Dm.MulRowsVec(Dv); Hm.MulRowsVec(Hv); - + Matrix Hm2(dimM, dimN); Dm.CopyToMat(&Hm2); @@ -856,7 +894,7 @@ static void UnitTestCuMatrixMulRowsVec() { } } -template +template static void UnitTestCuMatrixMulRowsGroupMat() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200; @@ -872,17 +910,17 @@ static void UnitTestCuMatrixMulRowsGroupMat() { CuMatrix Ds(dimM, dimNs); Dm.CopyFromMat(Hm); Ds.CopyFromMat(Hs); - + Dm.MulRowsGroupMat(Ds); Hm.MulRowsGroupMat(Hs); - + Matrix Hm2(dimM, dimN); Dm.CopyToMat(&Hm2); AssertEqual(Hm,Hm2); } } -template +template static void UnitTestCuMatrixGroupPnormDeriv() { int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200; int32 group_size = 1 + Rand() % 10; @@ -898,25 +936,25 @@ static void UnitTestCuMatrixGroupPnormDeriv() { Hm.ApplyFloor(0.0); // will put some zeros in the matrix.. harder to // do derivatives. Hs.GroupPnorm(Hm, power); - + CuMatrix Dm(dimM, dimN); CuMatrix Dr(dimM, dimN); CuMatrix Ds(dimM, dimNs); Dm.CopyFromMat(Hm); Dr.CopyFromMat(Hr); Ds.CopyFromMat(Hs); - - // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; + + // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Dr.GroupPnormDeriv(Dm, Ds, power); Hr.GroupPnormDeriv(Hm, Hs, power); - - // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; + + // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Matrix Hr2(dimM, dimN); Dr.CopyToMat(&Hr2); AssertEqual(Hr,Hr2); } -template +template static void UnitTestCuMatrixGroupMaxDeriv() { int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200; int32 group_size = 1 + Rand() % 10; @@ -931,19 +969,19 @@ static void UnitTestCuMatrixGroupMaxDeriv() { Hm.ApplyFloor(0.0); // will put some zeros in the matrix.. harder to // do derivatives. 
Hs.GroupMax(Hm); - + CuMatrix Dm(dimM, dimN); CuMatrix Dr(dimM, dimN); CuMatrix Ds(dimM, dimNs); Dm.CopyFromMat(Hm); Dr.CopyFromMat(Hr); Ds.CopyFromMat(Hs); - - // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; + + // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Dr.GroupMaxDeriv(Dm, Ds); Hr.GroupMaxDeriv(Hm, Hs); - - // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; + + // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Matrix Hr2(dimM, dimN); Dr.CopyToMat(&Hr2); AssertEqual(Hr,Hr2); @@ -963,7 +1001,7 @@ template static void UnitTestCuMatrixAddDiagVecMat() { KALDI_ASSERT(M.Sum() != 0.0); KALDI_ASSERT(N.Sum() != 0.0); - + CuVector V(dimM); V.SetRandn(); @@ -979,7 +1017,7 @@ template static void UnitTestCuMatrixAddDiagVecMat() { Mcheckrow.Scale(beta); Mcheckrow.AddVec(alpha * V(r), Nrow); } - + M.AddDiagVecMat(alpha, V, N, trans, beta); AssertEqual(M, Mcheck); KALDI_ASSERT(M.Sum() != 0.0); @@ -993,8 +1031,8 @@ template static void UnitTestCuMatrixAddMatDiagVec() { Real alpha = 0.43243, beta = 1.423; CuMatrix M(dimM, dimN), N(dimM, dimN), buf(dimM, dimN); - M.SetRandn(); - N.SetRandn(); + M.SetRandn(); + N.SetRandn(); buf.CopyFromMat(N); MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans); if (trans == kTrans) @@ -1003,9 +1041,9 @@ template static void UnitTestCuMatrixAddMatDiagVec() { CuVector V(dimN); V.SetRandn(); - CuMatrix Mcheck(M); + CuMatrix Mcheck(M); Mcheck.Scale(beta); - buf.MulColsVec(V); + buf.MulColsVec(V); Mcheck.AddMat(alpha, buf, kNoTrans); M.AddMatDiagVec(alpha, N, trans, V, beta); @@ -1032,7 +1070,7 @@ template static void UnitTestCuMatrixAddMatMatElements() { KALDI_ASSERT(M.Sum() != 0.0); } -template +template static void UnitTestCuMatrixDivRowsVec() { Matrix Hm(100,99); Vector Hv(100); @@ -1056,7 +1094,7 @@ static void UnitTestCuMatrixDivRowsVec() { -template +template static void UnitTestCuMatrixAddMat() { Matrix Ha(100,100); Matrix Hb(100,100); @@ -1075,15 +1113,15 @@ static void UnitTestCuMatrixAddMat() { Da.CopyToMat(&Ha2); AssertEqual(Ha,Ha2); - + //check use with submatrix CuMatrix mat1(10,10,kSetZero); mat1.AddMat(1.0,Da.Range(5,10,12,10)); //different stride for mat1,mat2 CuMatrix mat2(Da.Range(5,10,12,10)); AssertEqual(mat1,mat2); - + for (int i = 0; i < 10; i++) { - int32 N = 5 * (10 + Rand() % 10), M = 100 + Rand() % 50; + int32 N = 5 * (10 + Rand() % 10), M = 100 + Rand() % 50; Matrix Hc(N,M); Matrix Hd(M,N); Hc.SetRandn(); @@ -1093,11 +1131,11 @@ static void UnitTestCuMatrixAddMat() { CuMatrix Dd(M,N); Dc.CopyFromMat(Hc); Dd.CopyFromMat(Hd); - + Real alpha = 0.5; Dc.AddMat(alpha,Dd,kTrans); Hc.AddMat(alpha,Hd,kTrans); - + Matrix Hc2(N,M); Dc.CopyToMat(&Hc2); AssertEqual(Hc,Hc2); @@ -1105,13 +1143,46 @@ static void UnitTestCuMatrixAddMat() { // check use with submatrix CuMatrix mat3(N/5,M,kSetZero); mat3.AddMat(1.0, Dd.Range(0,M,0,N/5),kTrans); - + CuMatrix mat4(Dd.Range(0,M,0,N/5),kTrans); AssertEqual(mat3,mat4); } } -template +template +static void UnitTestCuMatrixAddMatBlocks() { + int32 num_row_blocks = 10, num_col_blocks = 20; + Matrix Ha1(100, 100), Ha2(100, 100); + Matrix Hb(100 * num_row_blocks, 100 * num_col_blocks); + Ha1.SetRandn(); + Ha2.SetRandn(); + Hb.SetRandn(); + + CuMatrix Da1(100, 100), Da2(100, 100); + CuMatrix Db(100 * num_row_blocks, 100 * num_col_blocks); + Da1.CopyFromMat(Ha1); + Da2.CopyFromMat(Ha2); + Db.CopyFromMat(Hb); + + for (int32 i = 0; i < num_row_blocks; i++) { + for (int32 j = 0; j < num_col_blocks; j++) { + 
SubMatrix Hs(Hb.Range(i * 100, 100, j * 100, 100)); + Ha1.AddMat(0.5, Hs, kNoTrans); + Ha2.AddMat(0.5, Hs, kTrans); + } + } + + Da1.AddMatBlocks(0.5, Db, kNoTrans); + Da2.AddMatBlocks(0.5, Db, kTrans); + Matrix Ha11(100, 100); + Da1.CopyToMat(&Ha11); + AssertEqual(Ha1,Ha11); + Matrix Ha22(100, 100); + Da2.CopyToMat(&Ha22); + AssertEqual(Ha2,Ha22); +} + +template static void UnitTestCuMatrixSum() { int32 M = 100 + Rand() % 300, N = 100 + Rand() % 300; CuMatrix A(M, N); @@ -1121,7 +1192,7 @@ static void UnitTestCuMatrixSum() { } -template +template static void UnitTestCuMatrixAddVecToCols() { Matrix Hm(100,99); Vector Hv(100); @@ -1144,7 +1215,7 @@ static void UnitTestCuMatrixAddVecToCols() { -template +template static void UnitTestCuMatrixAddVecToRows() { Matrix Hm(100,99); Vector Hv(99); @@ -1166,7 +1237,7 @@ static void UnitTestCuMatrixAddVecToRows() { } -template +template static void UnitTestCuMatrixSymAddMat2() { for (int32 i = 0; i < 2; i++) { int32 dimM = 10 + Rand() % 200, dimN = 10 + Rand() % 30; @@ -1196,7 +1267,7 @@ static void UnitTestCuMatrixSymAddMat2() { -template +template static void UnitTestCuMatrixSymInvertPosDef() { for (int32 i = 0; i < 2; i++) { int32 dimM = 10 + Rand() % 200, dimN = dimM + 20; @@ -1243,7 +1314,7 @@ static void UnitTestCuMatrixSymInvertPosDef() { } -template +template static void UnitTestCuMatrixAddMatMat() { Matrix Ha(200,100); Matrix Hb(100,200); @@ -1274,7 +1345,98 @@ static void UnitTestCuMatrixAddMatMat() { } -template +template +static void UnitTestCuMatrixAddVecVec() { + Vector x(100); + Vector y(200); + x.SetRandn(); + y.SetRandn(); + + CuVector Cux(100); + CuVector Cuy(200); + Cux.CopyFromVec(x); + Cuy.CopyFromVec(y); + + Matrix A(100,200); + CuMatrix CuA(100,200); + + A.AddVecVec(0.5f, x, y); + CuA.AddVecVec(0.5f, Cux, Cuy); + Matrix A2(100, 200); + CuA.CopyToMat(&A2); + + AssertEqual(A,A2); +} + + +template +static void UnitTestCuMatrixAddMatMatBatched() { + const int32 batchCount = 10; + std::vector* > Ha(batchCount), Hb(batchCount), Hc1(batchCount), Hc2(batchCount); + std::vector* > Da(batchCount), Db(batchCount), Dc1(batchCount), Dc2(batchCount); + std::vector* > HA, HB, HC1, HC2; + std::vector* > DA, DB, DC1, DC2; + + for (int32 i = 0; i < batchCount; i++) { + // first create a Matrix intance and then creat a SubMatrix instance from that + Ha[i] = new Matrix(200, 100); + Hb[i] = new Matrix(100, 200); + Hc1[i] = new Matrix(200, 200); + Hc2[i] = new Matrix(100, 100); + Ha[i]->SetRandn(); + Hb[i]->SetRandn(); + HA.push_back(new SubMatrix(*(Ha[i]), 0, Ha[i]->NumRows(), 0, + Ha[i]->NumCols())); + HB.push_back(new SubMatrix(*(Hb[i]), 0, Hb[i]->NumRows(), 0, + Hb[i]->NumCols())); + HC1.push_back(new SubMatrix(*(Hc1[i]), 0, Hc1[i]->NumRows(), 0, + Hc1[i]->NumCols())); + HC2.push_back(new SubMatrix(*(Hc2[i]), 0, Hc2[i]->NumRows(), 0, + Hc2[i]->NumCols())); + + // first create a CuMatrix intance and then creat a CuSubMatrix instance from that + Da[i] = new CuMatrix(200, 100); + Db[i] = new CuMatrix(100, 200); + Dc1[i] = new CuMatrix(200, 200); + Dc2[i] = new CuMatrix(100, 100); + Da[i]->CopyFromMat(*(Ha[i])); + Db[i]->CopyFromMat(*(Hb[i])); + DA.push_back(new CuSubMatrix(*(Da[i]), 0, Da[i]->NumRows(), 0, + Da[i]->NumCols())); + DB.push_back(new CuSubMatrix(*(Db[i]), 0, Db[i]->NumRows(), 0, + Db[i]->NumCols())); + DC1.push_back(new CuSubMatrix(*(Dc1[i]), 0, Dc1[i]->NumRows(), 0, + Dc1[i]->NumCols())); + DC2.push_back(new CuSubMatrix(*(Dc2[i]), 0, Dc2[i]->NumRows(), 0, + Dc2[i]->NumCols())); + } + + AddMatMatBatched(static_cast(0.5f), DC1, DA, 
kNoTrans, DB, kNoTrans, + static_cast(0.0f)); + AddMatMatBatched(static_cast(0.5f), DC2, DA, kTrans, DB, kTrans, + static_cast(0.0f)); + + // used to store results from DC1 and DC2 for equality check + Matrix Hca1(200,200); + Matrix Hca2(100,100); + + // equality check + for (int32 i = 0; i< batchCount; i++) { + (*HC1[i]).AddMatMat(0.5f, *(HA[i]), kNoTrans, *(HB[i]), kNoTrans, 0.0f); + (*HC2[i]).AddMatMat(0.5f, *(HA[i]), kTrans, *(HB[i]), kTrans, 0.0f); + DC1[i]->CopyToMat(&Hca1); + DC2[i]->CopyToMat(&Hca2); + AssertEqual(*(HC1[i]), Hca1); + AssertEqual(*(HC2[i]), Hca2); + delete Ha[i]; delete Hb[i]; delete Hc1[i]; delete Hc2[i]; + delete HA[i]; delete HB[i]; delete HC1[i]; delete HC2[i]; + delete Da[i]; delete Db[i]; delete Dc1[i]; delete Dc2[i]; + delete DA[i]; delete DB[i]; delete DC1[i]; delete DC2[i]; + } +} + + +template static void UnitTestCuMatrixAddToDiag() { for (int32 i = 0; i < 10; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; @@ -1288,7 +1450,7 @@ static void UnitTestCuMatrixAddToDiag() { } } -template +template static void UnitTestCuMatrixAdd2() { for (int32 i = 0; i < 10; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; @@ -1307,10 +1469,10 @@ template static void UnitTestCuMatrixCopyFromMat() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 5 * i + Rand() % 10; - + Matrix A(dim, dim); A.SetRandn(); - CuMatrix E(A); + CuMatrix E(A); CuMatrix B(dim, dim); B.CopyFromMat(E); @@ -1338,7 +1500,7 @@ template static void UnitTestCuMatrixAddMatTp() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 5 * i + Rand() % 10; - + Matrix A(dim, dim); Matrix B(dim, dim); TpMatrix C(dim); @@ -1348,7 +1510,7 @@ static void UnitTestCuMatrixAddMatTp() { CuMatrix D(A); CuMatrix E(B); CuTpMatrix F(C); - + A.AddMatTp(1.0, B, kNoTrans, C, kNoTrans, 1.0); D.AddMatTp(1.0, E, kNoTrans, F, kNoTrans, 1.0); @@ -1364,7 +1526,7 @@ static void UnitTestCuMatrixTranspose() { MatrixIndexT dimM = 5 * i + Rand() % 10, dimN = dimM; if (i % 2 == 0) dimN += 5; - + CuMatrix A(dimM, dimN); A.SetRandn(); CuMatrix B(A, kTrans); @@ -1377,7 +1539,7 @@ template static void UnitTestCuMatrixAddTpMat() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 5 * i + Rand() % 10; - + Matrix A(dim, dim); Matrix B(dim, dim); TpMatrix C(dim); @@ -1387,7 +1549,7 @@ static void UnitTestCuMatrixAddTpMat() { CuMatrix D(A); CuMatrix E(B); CuTpMatrix F(C); - + A.AddTpMat(1.0, C, kNoTrans, B, kNoTrans, 1.0); D.AddTpMat(1.0, F, kNoTrans, E, kNoTrans, 1.0); @@ -1399,7 +1561,7 @@ static void UnitTestCuMatrixAddTpMat() { /* * CuVector unit tests */ -template +template static void UnitTestCuVectorAddVec() { Vector Hv(777); Vector Hw(777); @@ -1417,13 +1579,13 @@ static void UnitTestCuVectorAddVec() { Vector Hv2(777); Dv.CopyToVec(&Hv2); - + AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuVectorAddRowSumMat() { const int32 X=4321, Y=19; Real alpha=0.1, beta=0.7; @@ -1440,7 +1602,7 @@ static void UnitTestCuVectorAddRowSumMat() { Dv.CopyFromVec(Hv); Dv.AddRowSumMat(alpha,Dm,beta); - + Hv_accu.SetZero(); Hv_accu.AddRowSumMat(1.0, Hm); Hv.Scale(beta); @@ -1454,7 +1616,7 @@ static void UnitTestCuVectorAddRowSumMat() { -template +template static void UnitTestCuVectorAddRowSumMatLarge() { Matrix Hm(1000,990); Vector Hv(990); @@ -1468,7 +1630,7 @@ static void UnitTestCuVectorAddRowSumMatLarge() { Dv.CopyFromVec(Hv); Dv.AddRowSumMat(0.5,Dm,0.7); - + Hv_accu.SetZero(); Hv_accu.AddRowSumMat(1.0, Hm); Hv.Scale(0.7); @@ -1482,7 +1644,7 @@ static void UnitTestCuVectorAddRowSumMatLarge() { 
-template +template static void UnitTestCuVectorAddColSumMat() { const int32 X=19, Y=4321; Real alpha=0.5, beta=0.7; @@ -1499,7 +1661,7 @@ static void UnitTestCuVectorAddColSumMat() { Dv.CopyFromVec(Hv); Dv.AddColSumMat(alpha,Dm,beta); - + Hv_accu.SetZero(); Hv_accu.AddColSumMat(1.0, Hm); Hv.Scale(beta); @@ -1511,7 +1673,7 @@ static void UnitTestCuVectorAddColSumMat() { AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuSubMatrix() { for (int32 iter = 0 ; iter < 10; iter++) { int32 M1 = 1 + rand () % 10, M2 = 1 + Rand() % 1, M3 = 1 + Rand() % 10, M = M1 + M2 + M3, @@ -1530,7 +1692,7 @@ static void UnitTestCuSubMatrix() { -template +template static void UnitTestCuVectorAddColSumMatLarge() { Matrix Hm(1000,990); Vector Hv(1000); @@ -1544,7 +1706,7 @@ static void UnitTestCuVectorAddColSumMatLarge() { Dv.CopyFromVec(Hv); Dv.AddColSumMat(0.5, Dm, 0.7); - + Hv_accu.SetZero(); Hv_accu.AddColSumMat(1.0, Hm); Hv.Scale(0.7); @@ -1558,7 +1720,7 @@ static void UnitTestCuVectorAddColSumMatLarge() { -template +template static void UnitTestCuVectorInvertElements() { Vector Hv(777); InitRand(&Hv); @@ -1571,11 +1733,11 @@ static void UnitTestCuVectorInvertElements() { Vector Hv2(777); Dv.CopyToVec(&Hv2); - + AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuMatrixInvertElements() { Matrix Hm(77, 77); InitRand(&Hm); @@ -1588,7 +1750,7 @@ static void UnitTestCuMatrixInvertElements() { Matrix Hm2(77, 77); Dm.CopyToMat(&Hm2); - + AssertEqual(Hm,Hm2); } @@ -1639,7 +1801,7 @@ static void UnitTestCuVectorAddTpVec() { AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuApproxEqual() { Real tol = 0.1; for (int32 i = 0; i < 2; i++) { @@ -1655,7 +1817,7 @@ static void UnitTestCuApproxEqual() { } } -template +template static void UnitTestCuVectorMulTp() { Vector Hv(300); InitRand(&Hv); @@ -1677,7 +1839,7 @@ static void UnitTestCuVectorMulTp() { AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuCopy() { for (int32 i = 0; i < 10; i++) { int32 M = 1 + Rand() % 10, N = 1 + Rand() % 10; @@ -1700,13 +1862,13 @@ static void UnitTestCuCopy() { CuMatrix J(I, kTrans); Matrix K(J, kTrans); CuMatrix L(K, kNoTrans); - + KALDI_ASSERT(A.ApproxEqual(L)); } } -template +template static void UnitTestCuSigmoid() { Matrix Hi(100,111); Matrix Ho(100,111); @@ -1733,7 +1895,7 @@ static void UnitTestCuSigmoid() { -template +template static void UnitTestCuDiffSigmoid() { Matrix Hi(100,111); Matrix Ho(100,111); @@ -1764,7 +1926,7 @@ static void UnitTestCuDiffSigmoid() { -template +template static void UnitTestCuSoftmax() { for (int32 i = 0; i < 2; i++) { @@ -1775,7 +1937,7 @@ static void UnitTestCuSoftmax() { Matrix Ho(row,col); Hi.SetRandn(); Hi.Scale(5.0); - + CuMatrix Di(row, col); CuMatrix Do(row, col); Di.CopyFromMat(Hi); @@ -1795,7 +1957,7 @@ static void UnitTestCuSoftmax() { } -template +template static void UnitTestCuLogSoftmax() { for (int32 i = 0; i < 2; i++) { @@ -1806,7 +1968,7 @@ static void UnitTestCuLogSoftmax() { Matrix Ho(row, col); Hi.SetRandn(); Hi.Scale(5.0); - + CuMatrix Di(row, col); CuMatrix Do(row, col); Di.CopyFromMat(Hi); @@ -1826,7 +1988,7 @@ static void UnitTestCuLogSoftmax() { } -template +template static void UnitTestCuFindRowMaxId() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; @@ -1860,7 +2022,7 @@ static void UnitTestCuFindRowMaxId() { -template +template static void UnitTestCuDiffXent() { int32 X=100, Y=111; //nnet output / diff @@ -1878,7 +2040,7 @@ static void UnitTestCuDiffXent() { //logpost vector 
Vector Hlogpost(X); CuVector Dlogpost(X); - + //gpu Di.DiffXent(Dtgt, &Dlogpost); //cpu @@ -1906,7 +2068,7 @@ template void UnitTestCheck() { CuMatrix Dj(Di); KALDI_LOG << Dj.NumRows(); - + } @@ -1957,7 +2119,7 @@ void UnitTestCuTanh() { H.SetRandn(); CuMatrix D(100,110); D.CopyFromMat(H); - + //gpu CuMatrix Di(100,110); Di.Tanh(D); @@ -1970,7 +2132,7 @@ void UnitTestCuTanh() { AssertEqual(Df,Hf); } -template +template static void UnitTestCuDiffTanh() { Matrix Hi(100,111); Matrix Ho(100,111); @@ -2016,7 +2178,7 @@ static void UnitTestCuMatrixSetRandn() { N.SetRandn(); AssertEqual(M, N); } - + for (int32 i = 0; i < 5; i++) { MatrixIndexT rows = 100 + Rand() % 50, cols = 100 + Rand() % 50; CuMatrix M(rows, cols); @@ -2118,7 +2280,7 @@ static void UnitTestCuMatrixSetZeroAboveDiag() { Matrix A_orig(A); A.SetZeroAboveDiag(); Matrix A_copy(A); - + for (int32 i = 0; i < dim; i++) { for (int32 j = 0; j < dim; j++) { Real aval = A_copy(i, j), aorigval = A_orig(i, j); @@ -2151,14 +2313,14 @@ static void UnitTestCuMatrixCopyUpperToLower() { } -template +template static void UnitTestCuMatrixObjfDeriv() { int32 n_r = 100 + Rand() % 200, n_c = 20 + Rand() % 30; CuMatrix A(n_r, n_c), B(n_r, n_c); B.SetRandn(); B.Add(1.0); B.ApplyFloor(1.0e-10); - + std::vector > labels; for(int i = 0; i < n_r; i++) { for(int j = 0; j < n_c; j++) { @@ -2176,11 +2338,11 @@ static void UnitTestCuMatrixObjfDeriv() { // (sv_labels, logprobs, &tot_objf, &tot_weight) C.CompObjfAndDeriv(labels, B, &a, &b); - + KALDI_ASSERT(ApproxEqual(b, A.Sum())); Real sum2; // sum(i, j) A(i, j) log(B(i, j)); - { + { CuMatrix Bcopy(B); Bcopy.ApplyLog(); sum2 = TraceMatMat(Bcopy, A, kTrans); @@ -2192,43 +2354,58 @@ static void UnitTestCuMatrixObjfDeriv() { KALDI_ASSERT(ApproxEqual(A, C)); } -template +template static void UnitTestCuMatrixAddElements() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 50, dimN = 100 + Rand() % 50; // int32 dimM = 256, dimN = 256; CuMatrix H(dimM, dimN); H.SetRandn(); + CuMatrix H_copy(H); CuMatrix M(H); + int32 num_elements = 100 + Rand() % 10; std::vector > input; + std::vector input_index; + Real *input_value = new Real[num_elements]; BaseFloat scale = -1 + (0.33 * (Rand() % 5)); - for (int32 j = 0; j < 100 + Rand() % 10; j++) { + for (int32 j = 0; j < num_elements; j++) { MatrixIndexT r = Rand() % dimM; MatrixIndexT c = Rand() % dimN; + Int32Pair tmp_pair; + tmp_pair.first = r; + tmp_pair.second = c; Real offset = -1 + (0.33 * (Rand() % 5)); M(r, c) += scale * offset; MatrixElement t = {r, c, offset}; input.push_back(t); + input_index.push_back(tmp_pair); + input_value[j] = offset; } H.AddElements(scale, input); + CuArray cu_input_index(input_index); + H_copy.AddElements(scale, cu_input_index, input_value); + delete[] input_value; AssertEqual(H, M); + AssertEqual(H_copy, M); } } -template +template static void UnitTestCuMatrixLookup() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; CuMatrix H(dimM, dimN); H.SetRandn(); + int32 num_elements = 10 + Rand() % 10; std::vector indices; std::vector reference; std::vector output; + output.resize(num_elements); // Generates the indices and the reference. 
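The extended AddElements test above applies the same sparse update twice, once through MatrixElement structs and once through an index array plus a value array, and checks both against a reference matrix updated in place. The semantics being exercised is just M(r, c) += scale * value for each listed element; a plain C++ sketch of that reference behaviour (Element is a hypothetical stand-in for MatrixElement):

#include <cassert>
#include <vector>

struct Element { int row, col; double value; };

// Reference semantics of the sparse update checked by the test:
// for every listed element, M(row, col) += scale * value.
static void RefAddElements(double scale, const std::vector<Element> &elems,
                           std::vector<double> *m, int cols) {
  for (const Element &e : elems)
    (*m)[e.row * cols + e.col] += scale * e.value;
}

int main() {
  const int rows = 2, cols = 3;
  std::vector<double> m(rows * cols, 1.0);
  std::vector<Element> elems = {{0, 1, 2.0}, {1, 2, -4.0}, {0, 1, 2.0}};
  RefAddElements(0.5, elems, &m, cols);
  assert(m[0 * cols + 1] == 3.0);   // 1.0 + 0.5*2.0 + 0.5*2.0
  assert(m[1 * cols + 2] == -1.0);  // 1.0 + 0.5*(-4.0)
  return 0;
}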
- for (int32 j = 0; j < 10 + Rand() % 10; j++) { + for (int32 j = 0; j < num_elements; j++) { MatrixIndexT r = Rand() % dimM; MatrixIndexT c = Rand() % dimN; @@ -2239,13 +2416,13 @@ static void UnitTestCuMatrixLookup() { reference.push_back(H(r, c)); } - H.Lookup(indices, &output); + H.Lookup(indices, &(output[0])); KALDI_ASSERT(reference == output); } } -template +template static void UnitTestCuMatrixEqualElementMask() { CuMatrix m1(10,9), m2(10,9); CuMatrix mask_same, mask_different; @@ -2280,12 +2457,14 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixScale(); UnitTestCuMatrixSigmoid(); UnitTestCuMatrixSoftHinge(); - UnitTestCuMatrixApplyPow(); - UnitTestCuMatrixApplyPowAbs(); + UnitTestCuMatrixApplyPow(); + UnitTestCuMatrixApplyPowAbs(); UnitTestCuMatrixSet(); UnitTestCuMatrixAdd(); UnitTestCuMatrixApplyFloor(); + UnitTestCuMatrixApplyCeiling(); UnitTestCuMatrixApplyHeaviside(); + UnitTestCuMatrixHeaviside(); UnitTestCuMatrixMulElements(); UnitTestCuMatrixDivElements(); UnitTestCuMatrixMax(); @@ -2293,11 +2472,14 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixMulRowsVec(); UnitTestCuMatrixDivRowsVec(); UnitTestCuMatrixAddMat(); + UnitTestCuMatrixAddMatBlocks(); UnitTestCuMatrixSum(); UnitTestCuMatrixAddVecToCols(); UnitTestCuMatrixAddVecToRows(); UnitTestCuMatrixAddMatMat(); + UnitTestCuMatrixAddVecVec(); UnitTestCuMatrixSymAddMat2(); + UnitTestCuMatrixAddMatMatBatched(); UnitTestCuMatrixSymInvertPosDef(); UnitTestCuMatrixCopyFromMat(); UnitTestCuMatrixCopyFromTp(); @@ -2318,7 +2500,7 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixSetZeroAboveDiag(); UnitTestCuMatrixAddElements(); UnitTestCuMatrixLookup(); - UnitTestCuMatrixEqualElementMask(); + UnitTestCuMatrixEqualElementMask(); // test CuVector methods UnitTestCuVectorAddVec(); UnitTestCuVectorAddRowSumMat(); @@ -2331,17 +2513,17 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixIO(); UnitTestCuSigmoid(); UnitTestCuApproxEqual(); - UnitTestCuCopy(); -#if HAVE_CUDA == 1 + UnitTestCuCopy(); +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) #endif UnitTestCuCopy(); UnitTestCuMatrixAddToDiag(); UnitTestCuMatrixAdd2(); UnitTestCuDiffSigmoid(); - UnitTestCuMatrixGroupPnorm(); + UnitTestCuMatrixGroupPnorm(); UnitTestCuMatrixGroupPnormDeriv(); - UnitTestCuMatrixGroupMax(); + UnitTestCuMatrixGroupMax(); UnitTestCuMatrixGroupMaxDeriv(); UnitTestCuMatrixMulRowsVec(); UnitTestCuMatrixMulRowsGroupMat(); @@ -2377,7 +2559,7 @@ int main() { kaldi::CudaMatrixUnitTest(); - + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { kaldi::CudaMatrixUnitTest(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 03e6f8cfe2c..53f220e0c41 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -25,7 +25,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include "base/timer.h" @@ -46,7 +46,8 @@ namespace kaldi { template void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type) { + MatrixResizeType resize_type, + MatrixStrideType stride_type) { // This code does not currently support the other resize_type options. 
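Resize() above gains a stride_type argument; in the hunk that follows, the default case keeps the pitched GPU allocation while the packed case sets the stride equal to the number of columns. A minimal sketch of the layout difference, using plain row-major indexing rather than the Kaldi classes (the pitched stride value 8 below is purely illustrative):

#include <cassert>
#include <cstddef>

// Element (r, c) of a row-major matrix lives at data[r * stride + c].
// With a pitched allocation the stride can exceed cols; with a packed
// ("stride equals num-cols") layout the rows are contiguous and the whole
// matrix is one block of rows * cols elements.
inline std::size_t Offset(std::size_t r, std::size_t c, std::size_t stride) {
  return r * stride + c;
}

int main() {
  const std::size_t rows = 3, cols = 5;
  const std::size_t pitched_stride = 8;    // illustrative pitch / sizeof(Real)
  const std::size_t packed_stride = cols;  // packed layout
  assert(Offset(2, 4, pitched_stride) == 20);
  assert(Offset(2, 4, packed_stride) == 14);
  // Packed layout: total elements == rows * cols, no per-row padding.
  assert(rows * packed_stride == rows * cols);
  return 0;
}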
KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined); if (rows * cols == 0) KALDI_ASSERT(rows == 0 && cols == 0); @@ -54,28 +55,35 @@ void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, if (resize_type == kSetZero) this->SetZero(); return; } - if (this->num_rows_ != 0) this->Destroy(); - if (rows == 0) return; + if (rows == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; MatrixIndexT row_bytes = cols * sizeof(Real); size_t pitch; - this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( - row_bytes, rows, &pitch)); - this->num_rows_ = rows; - this->num_cols_ = cols; - this->stride_ = pitch / sizeof(Real); + if (stride_type == kDefaultStride) { + this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( + row_bytes, rows, &pitch)); + this->num_rows_ = rows; + this->num_cols_ = cols; + this->stride_ = pitch / sizeof(Real); + } else { // kStrideEqualNumCols + size_t bytes = rows * cols * sizeof(Real); + this->data_ = static_cast(CuDevice::Instantiate().Malloc(bytes)); + this->num_rows_ = rows; + this->num_cols_ = cols; + this->stride_ = cols; + } if (resize_type == kSetZero) this->SetZero(); - CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim.Elapsed()); + CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim.Elapsed()); } else #endif { // Let the initializer of Matrix handle the allocation, // and then just do Swap which will switch the pointers. // This wastes a few instructions but is simple to code. - Matrix mat(rows, cols, resize_type); + Matrix mat(rows, cols, resize_type, stride_type); this->Swap(&mat); } } @@ -87,7 +95,7 @@ void CuMatrix::Destroy() { if (this->data_ != NULL) { Timer tim; CuDevice::Instantiate().Free(this->data_); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } } else #endif @@ -213,7 +221,7 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, KALDI_ASSERT(M.NumRows() == num_rows_ && M.NumCols() == num_cols_); } else { KALDI_ASSERT(M.NumCols() == num_rows_ && M.NumRows() == num_cols_); - } + } if (M.num_rows_ == 0) return; // Nothing to do. Timer tim; if (sizeof(Real) == sizeof(OtherReal) && trans == kNoTrans ) { @@ -223,15 +231,22 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, M.data_, src_pitch, width, M.num_rows_, cudaMemcpyDeviceToDevice)); } else { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // We are making this kernel "newer-style, with x corresponding to - // row dimension and y to column dimension. - dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK)); if (trans == kNoTrans) { + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_copy_from_mat(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim()); } else { - cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim()); + // 2D thread block with warps (blockDim.x) along the row-dim of input M. 
+ // Each (8x32) thread block will transpose (32x32) data + const int32 warpSize = 32; + dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); + dim3 dimGrid(n_blocks(M.NumCols(), warpSize), + n_blocks(M.NumRows(), warpSize)); + cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, M.data_, Dim(), + M.Dim()); } + CU_SAFE_CALL(cudaGetLastError()); } CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from other CuMatrixBase)", tim.Elapsed()); } else @@ -272,9 +287,9 @@ void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, if (trans == kNoTrans) { cuda_copy_from_tp(dimGrid, dimBlock, data_, M.Data(), Dim()); } else { - cuda_copy_from_tp_trans(dimGrid, dimBlock, data_, M.Data(), Dim()); + cuda_copy_from_tp_trans(dimGrid, dimBlock, data_, M.Data(), Dim()); } - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -294,10 +309,10 @@ template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, template void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixTransposeType trans) { -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (trans == kNoTrans) { - KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_); + KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_); Timer tim; MatrixIndexT dst_pitch = stride_*sizeof(Real); @@ -305,7 +320,7 @@ void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixIndexT width = src.NumCols()*sizeof(Real); CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyHostToDevice)); - + CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)",tim.Elapsed()); } else { CuMatrix trans_mat(src); // Do the transpose on the GPU board. 
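The transposed-copy launch above tiles the input M in 32x32 tiles: grid.x covers M.NumCols() and grid.y covers M.NumRows() in steps of warpSize, and each thread block has warpSize threads in x and CU1DBLOCK / warpSize threads in y. A quick check of that geometry (the value 256 for CU1DBLOCK is only an assumption here, and n_blocks is re-implemented locally as ceiling division):

#include <cassert>

// Ceiling division, mirroring the n_blocks() helper used in the launch code.
static int n_blocks(int size, int block) { return (size + block - 1) / block; }

int main() {
  const int warpSize = 32;
  const int kCu1dBlock = 256;                 // assumed value of CU1DBLOCK
  const int block_x = warpSize;               // 32 threads in x
  const int block_y = kCu1dBlock / warpSize;  // 8 threads in y
  assert(block_x * block_y == kCu1dBlock);
  // A 100x37 input M is covered by ceil(37/32) x ceil(100/32) = 2 x 4 blocks,
  // each block handling one 32x32 tile of M.
  assert(n_blocks(37, warpSize) == 2 && n_blocks(100, warpSize) == 4);
  // With 8 thread rows per block, each thread row covers 32/8 = 4 tile rows.
  assert(warpSize / block_y == 4);
  return 0;
}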
@@ -398,8 +413,8 @@ template template void CuMatrixBase::CopyToMat(MatrixBase *dst, MatrixTransposeType trans) const { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { if (trans == kTrans || sizeof(OtherReal) != sizeof(Real)) { CuMatrix this_trans(*this, trans); this_trans.CopyToMat(dst, kNoTrans); @@ -407,7 +422,7 @@ void CuMatrixBase::CopyToMat(MatrixBase *dst, KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols()); if (num_rows_ == 0) return; Timer tim; - + MatrixIndexT src_pitch = stride_*sizeof(Real); MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real); MatrixIndexT width = NumCols()*sizeof(Real); @@ -458,10 +473,10 @@ void CuMatrixBase::Write(std::ostream &os, bool binary) const { template void CuMatrixBase::SetZero() { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - CU_SAFE_CALL(cudaMemset2D(data_, stride_ * sizeof(Real), 0, + CU_SAFE_CALL(cudaMemset2D(data_, stride_ * sizeof(Real), 0, num_cols_ * sizeof(Real), num_rows_ )); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim.Elapsed()); } else @@ -477,15 +492,16 @@ void CuMatrixBase::SetZero() { /* * Methods wrapping the ANSI-C CUDA kernels */ -template +template void CuMatrixBase::Set(Real value) { - #if HAVE_CUDA == 1 + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_set_const(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -506,8 +522,9 @@ void CuMatrixBase::SetZeroAboveDiag() { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_set_zero_above_diag(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -526,15 +543,16 @@ void CuMatrixBase::SetZeroAboveDiag() { } } -template -void CuMatrixBase::Add(Real value) { -#if HAVE_CUDA == 1 +template +void CuMatrixBase::Add(Real value) { +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -547,9 +565,9 @@ void CuMatrixBase::Add(Real value) { } } -template -void CuMatrixBase::AddToDiag(Real value) { -#if HAVE_CUDA == 1 +template +void CuMatrixBase::AddToDiag(Real value) { +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; @@ -587,15 +605,16 @@ bool CuMatrixBase::IsUnit(Real tol) const { -template -void CuMatrixBase::Scale(Real value) { -#if HAVE_CUDA == 1 +template +void CuMatrixBase::Scale(Real value) { +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; 
+ GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_scale(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -608,15 +627,16 @@ void CuMatrixBase::Scale(Real value) { } } -template -void CuMatrixBase::ApplyLog() { - #if HAVE_CUDA == 1 +template +void CuMatrixBase::ApplyLog() { + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_log(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -637,13 +657,14 @@ void CuMatrixBase::MulElements(const CuMatrixBase& A) { KALDI_ASSERT(num_cols_ == A.NumCols()); KALDI_ASSERT(num_rows_ == A.NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_mul_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -660,13 +681,14 @@ void CuMatrixBase::DivElements(const CuMatrixBase& A) { KALDI_ASSERT(num_cols_ == A.NumCols()); KALDI_ASSERT(num_rows_ == A.NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_div_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -683,13 +705,14 @@ void CuMatrixBase::Max(const CuMatrixBase& A) { KALDI_ASSERT(num_cols_ == A.NumCols()); KALDI_ASSERT(num_rows_ == A.NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_max(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -701,14 +724,16 @@ void CuMatrixBase::Max(const CuMatrixBase& A) { template void CuMatrixBase::MulColsVec(const CuVectorBase &scale) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(scale.Dim() == NumCols()); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_mul_cols_vec(dimGrid, dimBlock, data_, scale.data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -732,31 +757,33 @@ void CuMatrixBase::MulRowsVec(const CuVectorBase &scale) { KALDI_ASSERT(scale.Dim() == NumRows()); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_mul_rows_vec(dimGrid, dimBlock, data_, scale.data_, Dim()); 
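Throughout these hunks the hand-rolled CU2DBLOCK launch shapes are replaced by a single call to GetBlockSizesForSimpleMatrixOperation(). For reference, the pattern being removed computed the shape as below; this is only a sketch of the old idiom, with CU2DBLOCK's value assumed to be 16 and n_blocks re-implemented as ceiling division:

#include <cassert>

// Ceiling division, as used by the old dimGrid computations.
static int n_blocks(int size, int block) { return (size + block - 1) / block; }

struct LaunchShape { int grid_x, grid_y, block_x, block_y; };

// The older pattern: a CU2DBLOCK x CU2DBLOCK thread block, and a grid with
// enough blocks to cover all columns in x and all rows in y.
static LaunchShape OldSimpleLaunchShape(int num_rows, int num_cols,
                                        int cu2dblock /* assumed 16 */) {
  LaunchShape s;
  s.block_x = cu2dblock;
  s.block_y = cu2dblock;
  s.grid_x = n_blocks(num_cols, cu2dblock);  // x covers columns
  s.grid_y = n_blocks(num_rows, cu2dblock);  // y covers rows
  return s;
}

int main() {
  LaunchShape s = OldSimpleLaunchShape(100, 99, 16);
  assert(s.grid_x == 7 && s.grid_y == 7);  // ceil(99/16) = ceil(100/16) = 7
  return 0;
}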
CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else + } else #endif { Mat().MulRowsVec(scale.Vec()); } } -template +template void CuMatrixBase::MulRowsGroupMat(const CuMatrixBase &src) { KALDI_ASSERT(src.NumCols() > 0); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; int group_size = this->NumCols() / src.NumCols(); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), - n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_mul_rows_group_mat(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size); @@ -776,14 +803,14 @@ void CuMatrixBase::GroupPnormDeriv(const CuMatrixBase &src1, KALDI_ASSERT(src2.NumCols() > 0); int group_size = this->NumCols() / src2.NumCols(); KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_calc_pnorm_deriv(dimGrid, dimBlock, this->data_, src1.Data(), src2.Data(), Dim(), src2.Stride(), group_size, power); + dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); + cuda_calc_pnorm_deriv(dimGrid, dimBlock, this->data_, src1.Data(), + src2.Data(), Dim(), src2.Stride(), group_size, power); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -802,11 +829,10 @@ void CuMatrixBase::GroupMaxDeriv(const CuMatrixBase &src1, KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - + Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); cuda_calc_group_max_deriv(dimGrid, dimBlock, this->data_, src1.Data(), src2.Data(), Dim(), src2.Stride(), group_size); @@ -828,14 +854,15 @@ void CuMatrixBase::DivRowsVec(const CuVectorBase &div) { KALDI_ASSERT(div.Dim() == NumRows()); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_div_rows_vec(dimGrid, dimBlock, data_, div.data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else + } else #endif { Vector temp(div.Vec()); // will copy. 
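The Group* operations above (MulRowsGroupMat, GroupPnormDeriv, GroupMaxDeriv) all relate a wide matrix to a narrow one through group_size = wide cols / narrow cols, and assert that the wide width is an exact multiple. A tiny sketch of that layout; the mapping of wide column j to group j / group_size is how I read the kernels, not something stated explicitly in this patch:

#include <cassert>

int main() {
  const int wide_cols = 12, narrow_cols = 4;
  const int group_size = wide_cols / narrow_cols;  // same division as the code
  assert(wide_cols == narrow_cols * group_size);   // mirrors the KALDI_ASSERT
  assert(group_size == 3);
  assert(7 / group_size == 2);  // wide column 7 would fall in group 2
  return 0;
}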
@@ -843,17 +870,18 @@ void CuMatrixBase::DivRowsVec(const CuVectorBase &div) { Mat().MulRowsVec(temp); } } - + template void CuMatrixBase::InvertElements() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); - cuda_invert_elements(dimGrid, dimBlock, data_, Dim()); + cuda_invert_elements(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -866,7 +894,7 @@ void CuMatrixBase::InvertElements() { template -void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, +void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, MatrixTransposeType transA) { #if HAVE_CUDA == 1 @@ -878,10 +906,14 @@ void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, } if (num_rows_ == 0) return; Timer tim; + // This block dimension seems to work better than the + // one from GetBlockSizesForSimpleMatrixOperation(). dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - cuda_add_mat(dimGrid, dimBlock, alpha, A.data_, data_, Dim(), A.Stride(), - (transA == kTrans ? 1 : 0)); + dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); + cuda_add_mat(dimGrid, dimBlock, alpha, A.data_, + data_, Dim(), A.Stride(), + (transA == kTrans ? 1 : 0)); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -893,7 +925,53 @@ void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, } template -void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, +void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, + MatrixTransposeType transA) { + if (num_rows_ == 0 || num_cols_ == 0) return; + int32 num_row_blocks, num_col_blocks; + if (transA == kNoTrans) { + KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0); + num_row_blocks = A.Mat().NumRows() / num_rows_; + num_col_blocks = A.Mat().NumCols() / num_cols_; + } else { + KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0); + num_row_blocks = A.Mat().NumRows() / num_cols_; + num_col_blocks = A.Mat().NumCols() / num_rows_; + } +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks, + num_col_blocks, data_, Dim(), A.Stride(), + (transA == kTrans ? 
1 : 0)); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 nr, nc; + if (transA == kNoTrans) { + nr = num_rows_; + nc = num_cols_; + } else { + nr = num_cols_; + nc = num_rows_; + } + for (int32 i = 0; i < num_row_blocks; i++) { + for (int32 j = 0; j < num_col_blocks; j++) { + Mat().AddMat(alpha, SubMatrix(A.Mat(), i * nr, nr, j * nc, nc), + transA); + } + } + } +} + +template +void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, const CuMatrixBase &B, const CuMatrixBase &C) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -903,11 +981,11 @@ void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, KALDI_ASSERT(num_rows_ == B.num_rows_ && num_cols_ == B.num_cols_); KALDI_ASSERT(num_rows_ == C.num_rows_ && num_cols_ == C.num_cols_); if (num_rows_ == 0) return; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_add_mat_mat_div_mat(dimGrid, dimBlock, A.data_, B.data_, C.data_, data_, Dim(), A.Stride(), B.Stride(), C.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_mat_div_mat(dimGrid, dimBlock, A.data_, B.data_, C.data_, + data_, Dim(), A.Stride(), B.Stride(), C.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -921,7 +999,7 @@ void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, template void CuMatrixBase::AddVecToCols(Real alpha, const CuVectorBase &col, - Real beta) { + Real beta) { if (col.Dim() != NumRows()) { KALDI_ERR << "Non matching dimensions: Rows:" << NumRows() << " VectorDim:" << col.Dim(); } @@ -929,13 +1007,13 @@ void CuMatrixBase::AddVecToCols(Real alpha, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.data_, beta, data_, Dim()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.data_, beta, + data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -950,20 +1028,19 @@ void CuMatrixBase::AddVecToCols(Real alpha, template void CuMatrixBase::AddVecToRows(Real alpha, const CuVectorBase &row, - Real beta) { + Real beta) { if (row.Dim() != NumCols()) { KALDI_ERR << "Non matching dimensions: Cols:" << NumCols() << " VectorDim:" << row.Dim(); } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_vec_to_rows(dimGrid, dimBlock, alpha, row.data_, beta, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -986,7 +1063,7 @@ void CuMatrixBase::AddMatMat( // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping? // keep trans..., just swap A&B matrices: A->B B->A - MatrixIndexT m = ((transB==kTrans)? B.NumRows() : B.NumCols()); + MatrixIndexT m = ((transB==kTrans)? B.NumRows() : B.NumCols()); MatrixIndexT n = ((transA==kTrans)? 
A.NumCols() : A.NumRows()); MatrixIndexT k = ((transB==kTrans)? B.NumCols() : B.NumRows()); MatrixIndexT k1 = ((transA==kTrans)? A.NumRows() : A.NumCols()); @@ -996,17 +1073,16 @@ void CuMatrixBase::AddMatMat( KALDI_ASSERT(k == k1); if (m == 0) return; - - + + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - cublas_gemm((transB==kTrans?'T':'N'), (transA==kTrans?'T':'N'), m, n, k, - alpha, B.data_, B.Stride(), A.data_, A.Stride(), - beta, data_, Stride()); - - CU_SAFE_CALL(cublasGetError()); + CU_SAFE_CALL(cublas_gemm(GetCublasHandle(), + (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), + (transA==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), + m, n, k, alpha, B.data_, B.Stride(), + A.data_, A.Stride(), beta, data_, Stride())); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1017,6 +1093,29 @@ void CuMatrixBase::AddMatMat( } +template +void CuMatrixBase::AddVecVec( + Real alpha, const CuVectorBase &x, const CuVectorBase &y) { + + MatrixIndexT m = y.Dim(); + MatrixIndexT n = x.Dim(); + KALDI_ASSERT(m == NumCols()); + KALDI_ASSERT(n == NumRows()); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + CU_SAFE_CALL(cublas_ger(GetCublasHandle(), m, n, alpha, + y.Data(), 1, x.Data(), 1, data_, Stride())); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + Mat().AddVecVec(alpha, x.Vec(), y.Vec()); + } +} + template void CuMatrixBase::SymAddMat2( @@ -1031,13 +1130,11 @@ void CuMatrixBase::SymAddMat2( #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - char trans = (transA == kTrans ? 'N' : 'T'); + cublasOperation_t trans = (transA == kTrans ? CUBLAS_OP_N : CUBLAS_OP_T); MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_); - - cublas_syrk('U', trans, num_rows_, A_other_dim, alpha, A.Data(), - A.Stride(), beta, this->data_, this->stride_); - - CU_SAFE_CALL(cublasGetError()); + CU_SAFE_CALL(cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, trans, + num_rows_, A_other_dim, alpha, A.Data(), + A.Stride(), beta, this->data_, this->stride_)); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1064,10 +1161,8 @@ void CuMatrixBase::AddDiagVecMat( Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); - MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; if (transM == kTrans) std::swap(M_row_stride, M_col_stride); @@ -1080,12 +1175,12 @@ void CuMatrixBase::AddDiagVecMat( { Mat().AddDiagVecMat(alpha, v.Vec(), M.Mat(), transM, beta); } -} +} template void CuMatrixBase::AddMatDiagVec( - const Real alpha, + const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, CuVectorBase &v, Real beta) { @@ -1099,14 +1194,11 @@ void CuMatrixBase::AddMatDiagVec( KALDI_ASSERT(v.Dim() == this->NumCols()); Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // Caution, this dimGrid is not the same way around as much of the other - // code: going forward, I want to use the (rows, cols) order. 
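The comment above ("keep trans..., just swap A&B matrices") relies on the identity (A*B)^T = B^T * A^T: a row-major buffer read as column-major data is the transpose of the matrix it stores, so a row-major C = A*B is obtained by asking a column-major GEMM for C^T = B^T * A^T, i.e. passing B first, A second, and swapping the m and n sizes. A small verification of that index identity in plain C++ (ColMajorGemm is a local reference routine, not the cuBLAS call):

#include <cassert>
#include <vector>

// Plain column-major GEMM: C(m x n) = A(m x k) * B(k x n), packed storage
// (leading dimension equals the number of rows).
static void ColMajorGemm(int m, int n, int k, const std::vector<double> &A,
                         const std::vector<double> &B, std::vector<double> *C) {
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++) {
      double sum = 0.0;
      for (int p = 0; p < k; p++) sum += A[p * m + i] * B[j * k + p];
      (*C)[j * m + i] = sum;
    }
}

int main() {
  // Row-major A (2x3) and B (3x2); we want row-major C = A * B (2x2).
  std::vector<double> a = {1, 2, 3, 4, 5, 6};     // A, row-major 2x3
  std::vector<double> b = {7, 8, 9, 10, 11, 12};  // B, row-major 3x2
  std::vector<double> c_rowmajor = {58, 64, 139, 154};  // C = A*B, row-major
  // The trick: feed the *same buffers* to the column-major GEMM with the
  // operands swapped (B first, A second) and m/n swapped; the output buffer
  // then already holds row-major C.
  std::vector<double> c(4, 0.0);
  ColMajorGemm(/*m=*/2, /*n=*/2, /*k=*/3, /*A=*/b, /*B=*/a, &c);
  for (int i = 0; i < 4; i++) assert(c[i] == c_rowmajor[i]);
  return 0;
}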
- dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; if (transM == kTrans) std::swap(M_row_stride, M_col_stride); - cuda_add_mat_diag_vec(dimGrid, dimBlock, alpha, data_, Dim(), M.Data(), M_row_stride, M_col_stride, v.Data(), beta); CU_SAFE_CALL(cudaGetLastError()); @@ -1119,14 +1211,16 @@ void CuMatrixBase::AddMatDiagVec( } template -void CuMatrixBase::AddMatMatElements(Real alpha, +void CuMatrixBase::AddMatMatElements(Real alpha, const CuMatrixBase &A, const CuMatrixBase &B, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - cuda_add_mat_mat_elements(dimGrid, dimBlock, this->data_, A.Data(), B.Data(), Dim(), A.Stride(), B.Stride(), alpha, beta); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_mat_elements(dimGrid, dimBlock, this->data_, A.Data(), + B.Data(), Dim(), A.Stride(), B.Stride(), alpha, beta); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1139,16 +1233,16 @@ void CuMatrixBase::AddMatMatElements(Real alpha, template void CuMatrixBase::Sigmoid(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - - cuda_sigmoid(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_sigmoid(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1160,16 +1254,16 @@ void CuMatrixBase::Sigmoid(const CuMatrixBase &src) { template void CuMatrixBase::SoftHinge(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - - cuda_soft_hinge(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_soft_hinge(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1184,11 +1278,13 @@ void CuMatrixBase::GroupPnorm(const CuMatrixBase &src, Real power) { KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size && this->NumRows() == src.NumRows()); #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - cuda_group_pnorm(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size, power); + dim3 dimGrid(n_blocks(NumCols(), 
CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); + cuda_group_pnorm(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride(), group_size, power); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1207,7 +1303,8 @@ void CuMatrixBase::GroupMax(const CuMatrixBase &src) { if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); + dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); cuda_group_max(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size); CU_SAFE_CALL(cudaGetLastError()); @@ -1241,7 +1338,7 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s iter->column < num_cols && iter->column >= 0); } } - + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (sv_labels.empty()) { @@ -1277,7 +1374,7 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s KALDI_ASSERT(this_prob >= 0.99e-20); // we floored to 1.0e-20 in SoftmaxLayer. *tot_objf += weight * Log(this_prob); *tot_weight += weight; - (*this)(m, label) += weight / this_prob; + (*this)(m, label) += weight / this_prob; } } } @@ -1285,7 +1382,7 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s template // Y->this, X->src void CuMatrixBase::ApplySoftMaxPerRow(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; size_t dimBlock = src.num_cols_ > CU1DBLOCK ? CU1DBLOCK : src.num_cols_; @@ -1308,7 +1405,7 @@ void CuMatrixBase::ApplySoftMaxPerRow(const CuMatrixBase &src) { template // Y->this, X->src void CuMatrixBase::ApplyLogSoftMaxPerRow(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; size_t dimBlock = src.num_cols_ > CU1DBLOCK ? 
CU1DBLOCK : src.num_cols_; @@ -1334,13 +1431,12 @@ template // Eout -> *this, Ein -> diff, Y -> value void CuMatrixBase::DiffSigmoid(const CuMatrixBase &value, const CuMatrixBase &diff) { KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_diff_sigmoid(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride()); CU_SAFE_CALL(cudaGetLastError()); @@ -1352,20 +1448,20 @@ void CuMatrixBase::DiffSigmoid(const CuMatrixBase &value, } } - + template void CuMatrixBase::Tanh(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_tanh(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1379,13 +1475,12 @@ void CuMatrixBase::Tanh(const CuMatrixBase &src) { template // Ein -> diff, Y -> value void CuMatrixBase::DiffTanh(const CuMatrixBase &value, const CuMatrixBase &diff) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_diff_tanh(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride()); CU_SAFE_CALL(cudaGetLastError()); @@ -1399,39 +1494,18 @@ void CuMatrixBase::DiffTanh(const CuMatrixBase &value, template void CuMatrixBase::FindRowMaxId(CuArray *id) const { -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - // initialize the vectors - CuVector max(num_rows_); - max.Set(-1e21); id->Resize(num_rows_); - id->Set(-1); + MatrixDim d = Dim(); - MatrixDim d=Dim(); // only stride will be used! - - // process per 256 column blocks - for (int32 block = 0; (block+1)*256 <= num_cols_; block++) { - dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(1, num_rows_); - int32 offset = block*CU1DBLOCK; + // CUDA thread layout: one thread block per matrix-row. + dim3 dimBlock(CU1DBLOCK); + dim3 dimGrid(num_rows_); + cuda_find_row_max_id(dimGrid, dimBlock, data_, NULL, id->Data(), d); + CU_SAFE_CALL(cudaGetLastError()); - cuda_find_row_max_id(dimGrid, dimBlock, data_ + offset, - max.data_, id->Data(), offset, d); - } - - // process the remainder - int32 div = num_cols_ / 256; - int32 mod = num_cols_ % 256; - if (mod != 0) { - dim3 dimBlock(mod, 1); - dim3 dimGrid(1, num_rows_); - int32 offset=div*256; - - cuda_find_row_max_id(dimGrid, dimBlock, data_ + offset, - max.data_, id->Data(), offset, d); - } // now we have the indices! 
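FindRowMaxId, rewritten above to use one thread block per matrix row, returns for each row the column index of its maximum element; the CPU fallback (below) keeps the first, i.e. lowest, index on ties because it compares with a strict '<'. A tiny C++ reference of that per-row argmax behaviour:

#include <cassert>
#include <vector>

// Per-row argmax with "first maximum wins" tie-breaking, mirroring the
// strict '<' comparison in the CPU fallback.
static int RowArgMax(const std::vector<double> &row) {
  double max = -1e21;
  int max_id = -1;
  for (int c = 0; c < static_cast<int>(row.size()); c++)
    if (max < row[c]) { max = row[c]; max_id = c; }
  return max_id;
}

int main() {
  assert(RowArgMax({0.5, 2.0, -1.0, 2.0}) == 1);  // ties keep the first index
  assert(RowArgMax({-3.0, -2.0, -7.0}) == 1);
  return 0;
}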
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1442,11 +1516,11 @@ void CuMatrixBase::FindRowMaxId(CuArray *id) const { id->Set(-1); // find maxima MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; - for(MatrixIndexT r = 0; r < num_rows; r++) { + for (MatrixIndexT r = 0; r < num_rows; r++) { Real max = -1e21; int32 max_id = -1; const Real *row_data = Mat().RowData(r); - for(MatrixIndexT c = 0; c < num_cols; c++) { + for (MatrixIndexT c = 0; c < num_cols; c++) { if (max < row_data[c]) { max = row_data[c]; max_id = c; @@ -1460,14 +1534,13 @@ void CuMatrixBase::FindRowMaxId(CuArray *id) const { template void CuMatrixBase::DiffXent(const CuArray &tgt, CuVector *log_post_tgt) { - + KALDI_ASSERT(tgt.Dim() == num_rows_); log_post_tgt->Resize(tgt.Dim()); -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(1, CU2DBLOCK*8); dim3 dimGrid(1, n_blocks(tgt.Dim(), CU2DBLOCK*8)); cuda_diff_xent(dimGrid, dimBlock, tgt.Data(), data_, @@ -1527,16 +1600,16 @@ void CuMatrixBase::Cholesky(CuMatrixBase *inv_cholesky) { return; } // At this point, if none of the other cases apply, we recurse. - + // The selection of dim1 is a heuristic. We could also just take half. int32 tot_dim = this->NumRows(); int32 dim1; // Break it up into a whole number of blocks, for better memory alignment. // The line below, setting dim1 can be decided on a heuristic basis: from - // the point of view of correctness, it can really be any value + // the point of view of correctness, it can really be any value // 0 < dim1 < tot_dim. dim1 = block_size * std::max(1, tot_dim / (2 * block_size)); - + int32 dim2 = tot_dim - dim1; CuSubMatrix this_11(*this, 0, dim1, 0, dim1), this_12(*this, 0, dim1, dim1, dim2), @@ -1567,7 +1640,7 @@ void CuMatrixBase::Cholesky(CuMatrixBase *inv_cholesky) { L21 = A21 inv(L11') = A21 M11' We can compute L22 and M22 recursively by doing Cholesky (and computing the inverse Cholesky) on the quantity T = (A22 - L21 L21'). [we give it the name T just for easy reference.] - + Computationally, we do this as follows: (1) Recurse to get L11 and M11. (2) Compute L21 = A21 M11' @@ -1607,7 +1680,7 @@ void CuMatrixBase::Cholesky(CuMatrixBase *inv_cholesky) { // (5)(d) zero L12 and M12. this_12.SetZero(); inv_12.SetZero(); -} +} @@ -1617,13 +1690,13 @@ void CuMatrixBase::SymInvertPosDef() { if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; + Timer tim; CuMatrix inv_cholesky(num_rows_, num_rows_); this->Cholesky(&inv_cholesky); // note: SymAddMat2 only updates lower part of *this. this->SymAddMat2(1.0, inv_cholesky, kTrans, 0.0); this->CopyLowerToUpper(); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -1661,31 +1734,35 @@ Real TraceMatMat(const CuMatrixBase &A, } else { KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols()); } - if (A.NumRows() * A.NumCols() > 16384) { - // This version in which we don't use a special-purpose kernel, but - // do AddDiagMat on a temporary vector and returns its sum, seems to be - // faster for larger matrices. The cutoff is approximate and - // we only looked at the time on square matrices, which - // is what we test in cu-matrix-speed-test.cc. 
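Both the removed AddDiagMatMat-based path and the new reduction kernel compute the same quantity: trace(A*B) needs only the diagonal of A*B, so trace(A*B) = sum_i sum_k A(i,k)*B(k,i), and in the transposed case trace(A*B^T) is just the sum of elementwise products of A and B. A short numeric check of that identity in plain C++:

#include <cassert>
#include <cmath>
#include <vector>

int main() {
  // A and B are 2x2, row-major.
  std::vector<double> A = {1, 2, 3, 4}, B = {5, 6, 7, 8};
  double trace_ab = 0.0, trace_abt = 0.0;
  for (int i = 0; i < 2; i++)
    for (int k = 0; k < 2; k++) {
      trace_ab  += A[i * 2 + k] * B[k * 2 + i];  // trace(A * B)
      trace_abt += A[i * 2 + k] * B[i * 2 + k];  // trace(A * B^T)
    }
  // Diagonal of A*B is {19, 50}, so trace(A*B) = 69.
  assert(std::fabs(trace_ab - 69.0) < 1e-12);
  // Elementwise: 1*5 + 2*6 + 3*7 + 4*8 = 70.
  assert(std::fabs(trace_abt - 70.0) < 1e-12);
  return 0;
}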
- CuVector sum_vec(A.NumRows()); - sum_vec.AddDiagMatMat(1.0, A, kNoTrans, - B, trans, 0.0); - return sum_vec.Sum(); - } else { - Timer tim; - // the sizes of result_vec must match what we - // call the kernels with, in cu-kernels.cu - CuVector result_vec(trans == kTrans ? 4 : 2, kUndefined); - if (trans == kNoTrans) { - cuda_trace_mat_mat(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data()); - } else { - cuda_trace_mat_mat_trans(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data()); + Timer tim; + // 2D blocks: each (8x32) block sums up (32x32) elements. + // 2D grid: try to cover all the matrix A unless it is too big. + // Kernel will reduce to ~256 elements with good performance, + // if the matrix is not in a very bad shape. + // (wider or taller than 32x8192) + // CPU will then reduce to 1 element. + const int kWarpSize = 32; + dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); + dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize), + n_blocks(A.NumRows(), kWarpSize)); + if (dimGrid.x * dimGrid.y > 256) { + dimGrid.y = 256 / dimGrid.x; + if (dimGrid.y == 0) { + dimGrid.y = 1; } - CU_SAFE_CALL(cudaGetLastError()); - Vector result_cpu(result_vec); // copying from CUDA faster than summing in CUDA. - result = result_cpu.Sum(); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } + CuVector result_vec(dimGrid.x * dimGrid.y, kUndefined); + if (trans == kNoTrans) { + cuda_trace_mat_mat(dimGrid, dimBlock, A.Data(), B.Data(), A.Dim(), + B.Stride(), result_vec.Data()); + } else { + cuda_trace_mat_mat_trans(dimGrid, dimBlock, A.Data(), B.Data(), A.Dim(), + B.Stride(), result_vec.Data()); + } + CU_SAFE_CALL(cudaGetLastError()); + Vector result_cpu(result_vec); // copying from CUDA faster than summing in CUDA. + result = result_cpu.Sum(); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -1703,6 +1780,93 @@ double TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, MatrixTransposeType trans); +template +void AddMatMatBatched(const Real alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const Real beta) { + KALDI_ASSERT(A.size() == B.size() && B.size() == C.size()); + int32 size = A.size(); + + if (size == 0) return; + + // all elements must have the same num-rows, num-cols and stride + for (int32 i = 0; i + 1 < size; i++) { + KALDI_ASSERT(A[i]->NumRows() == A[i+1]->NumRows()); + KALDI_ASSERT(A[i]->NumCols() == A[i+1]->NumCols()); + KALDI_ASSERT(A[i]->Stride() == A[i+1]->Stride()); + KALDI_ASSERT(B[i]->NumRows() == B[i+1]->NumRows()); + KALDI_ASSERT(B[i]->NumCols() == B[i+1]->NumCols()); + KALDI_ASSERT(B[i]->Stride() == B[i+1]->Stride()); + KALDI_ASSERT(C[i]->NumRows() == C[i+1]->NumRows()); + KALDI_ASSERT(C[i]->NumCols() == C[i+1]->NumCols()); + KALDI_ASSERT(C[i]->Stride() == C[i+1]->Stride()); + } + // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping? + // keep trans..., just swap A&B matrices: A->B B->A + MatrixIndexT m = ((transB==kTrans)? B[0]->NumRows() : B[0]->NumCols()); + MatrixIndexT n = ((transA==kTrans)? A[0]->NumCols() : A[0]->NumRows()); + MatrixIndexT k = ((transB==kTrans)? B[0]->NumCols() : B[0]->NumRows()); + MatrixIndexT k1 = ((transA==kTrans)? 
A[0]->NumRows() : A[0]->NumCols()); + + KALDI_ASSERT(m == C[0]->NumCols()); + KALDI_ASSERT(n == C[0]->NumRows()); + KALDI_ASSERT(k == k1); + + if (m == 0) return; + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + Real **device_abc_array = + static_cast(CuDevice::Instantiate().Malloc(3 * size * sizeof(Real*))); + const Real **device_a_array = const_cast(device_abc_array); + const Real **device_b_array = const_cast(device_abc_array) + size; + Real **device_c_array = device_abc_array + 2 * size; + const Real **host_abc_array = new const Real*[3*size]; + const Real **host_a_array = host_abc_array; + const Real **host_b_array = host_abc_array + size; + const Real **host_c_array = host_abc_array + 2 * size; + + for (int32 i = 0; i < size; i++) { + host_a_array[i] = A[i]->data_; + host_b_array[i] = B[i]->data_; + host_c_array[i] = C[i]->data_; + } + + CU_SAFE_CALL(cudaMemcpy(device_abc_array, host_abc_array, 3*size*sizeof(Real*), cudaMemcpyHostToDevice)); + + CU_SAFE_CALL(cublas_gemmBatched(GetCublasHandle(), + (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), + (transA==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), + m, n, k, alpha, device_b_array, B[0]->Stride(), + device_a_array, A[0]->Stride(), beta, + device_c_array, C[0]->Stride(), size)); + + CuDevice::Instantiate().Free(device_abc_array); + delete[] host_abc_array; + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + for (int32 i = 0; i < size; i++) { + C[i]->Mat().AddMatMat(alpha, A[i]->Mat(), transA, B[i]->Mat(), transB, beta); + } + } +} + +template +void AddMatMatBatched(const float alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const float beta); + +template +void AddMatMatBatched(const double alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const double beta); template void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { @@ -1722,10 +1886,11 @@ void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { cudaMemcpyDeviceToDevice)); } } else if (v.Dim() == num_cols_) { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // this is a newer kernel where (x,y) dims represent (rows,cols). - dim3 dimGrid(n_blocks(NumRows(),CU2DBLOCK), n_blocks(NumCols(),CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data()); + CU_SAFE_CALL(cudaGetLastError()); } else { KALDI_ERR << "Wrong sized arguments"; } @@ -1755,17 +1920,11 @@ void CuMatrixBase::CopyRowsFromVec(const VectorBase &v) { } } } else if (v.Dim() == num_cols_) { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This is a newer kernel where x corresponds to NumRows() and y to NumCols(). 
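The CopyRowsFromVec overloads above accept two vector shapes: a vector of size rows*cols, whose consecutive row-sized chunks fill the rows, and a vector of size cols, which is written into every row. That is my reading of the two branches (the rows*cols case is suggested by the contiguous memcpy when stride equals num-cols), not a statement of the official API contract. A plain C++ sketch of those two fill patterns:

#include <cassert>
#include <vector>

int main() {
  const int rows = 2, cols = 3;
  std::vector<double> m(rows * cols, 0.0);

  // Case 1: v.size() == rows * cols -> row r gets v[r*cols .. r*cols + cols-1].
  std::vector<double> v1 = {1, 2, 3, 4, 5, 6};
  for (int r = 0; r < rows; r++)
    for (int c = 0; c < cols; c++) m[r * cols + c] = v1[r * cols + c];
  assert(m[1 * cols + 0] == 4.0);

  // Case 2: v.size() == cols -> every row becomes a copy of v.
  std::vector<double> v2 = {7, 8, 9};
  for (int r = 0; r < rows; r++)
    for (int c = 0; c < cols; c++) m[r * cols + c] = v2[c];
  assert(m[0] == 7.0 && m[1 * cols + 2] == 9.0);
  return 0;
}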
- dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_copy_rows_from_vec(dimGrid, dimBlock, this->data_, this->Dim(), v.Data()); CU_SAFE_CALL(cudaGetLastError()); - - /* const Real *v_data = v.Data(); - for (MatrixIndexT r = 0; r < num_rows_; r++) - cudaMemcpy(RowData(r), v_data, sizeof(Real)*num_cols_, cudaMemcpyHostToDevice); */ } else { KALDI_ERR << "Wrong sized arguments"; } @@ -1787,9 +1946,9 @@ void CuMatrixBase::CopyColFromVec(const CuVectorBase &v, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - int dimBlock(CU1DBLOCK); - int dimGrid(n_blocks(NumRows(), CU1DBLOCK)); - cuda_copy_col_from_vec(dimGrid, dimBlock, data_, v.Data(), col, Dim()); + cublas_copy(GetCublasHandle(), + v.Dim(), v.Data(), 1, + this->data_ + col, this->stride_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1804,10 +1963,9 @@ void CuMatrixBase::ApplyPow(Real power) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_pow(dimGrid, dimBlock, data_, power, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1823,10 +1981,9 @@ void CuMatrixBase::ApplyPowAbs(Real power, bool include_sign) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_pow_abs(dimGrid, dimBlock, data_, power, include_sign, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1842,10 +1999,9 @@ void CuMatrixBase::ApplyHeaviside() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_heaviside(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1856,15 +2012,35 @@ void CuMatrixBase::ApplyHeaviside() { } } +template +void CuMatrixBase::Heaviside(const CuMatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_heaviside(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else + #endif + { + Mat().Heaviside(src.Mat()); + } +} template void CuMatrixBase::ApplyExp() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), 
NumCols(), + &dimGrid, &dimBlock); cuda_apply_exp(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1881,9 +2057,9 @@ void CuMatrixBase::ApplyFloor(Real floor_val) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_floor(dimGrid, dimBlock, data_, floor_val, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1899,9 +2075,9 @@ void CuMatrixBase::ApplyCeiling(Real ceiling_val) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1922,6 +2098,7 @@ void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { if (mat.Stride() == mat.NumCols()) { cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, cudaMemcpyDeviceToHost); } else { + // we could definitely do better than the following. Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { cudaMemcpy(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), @@ -1952,9 +2129,9 @@ void CuMatrixBase::CopyCols(const CuMatrixBase &src, KALDI_ASSERT(indices.Dim() == NumCols()); KALDI_ASSERT(NumRows() == src.NumRows()); Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_copy_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1965,7 +2142,7 @@ void CuMatrixBase::CopyCols(const CuMatrixBase &src, } } - + template void CuMatrixBase::CopyRows(const CuMatrixBase &src, const CuArray &indices) { @@ -1973,12 +2150,13 @@ void CuMatrixBase::CopyRows(const CuMatrixBase &src, if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(indices.Dim()) == NumRows()); KALDI_ASSERT(NumCols() == src.NumCols()); - + Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). 
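The CopyColFromVec hunk a little above replaces a dedicated kernel with a BLAS-style strided copy: because element (r, col) of a row-major matrix lives at data[r*stride + col], copying a length-NumRows vector with output increment equal to the row stride writes exactly one column. A small sketch of that indexing (StridedCopy is a local stand-in for the cuBLAS copy, not its real signature):

#include <cassert>
#include <vector>

// BLAS-style copy: y[i * incy] = x[i * incx] for i in [0, n).
static void StridedCopy(int n, const double *x, int incx, double *y, int incy) {
  for (int i = 0; i < n; i++) y[i * incy] = x[i * incx];
}

int main() {
  const int rows = 3, cols = 4, stride = cols, col = 2;
  std::vector<double> m(rows * stride, 0.0);
  std::vector<double> v = {1.0, 2.0, 3.0};
  // Writing with increment == stride fills column `col` of the matrix.
  StridedCopy(rows, v.data(), 1, m.data() + col, stride);
  assert(m[0 * stride + col] == 1.0 &&
         m[1 * stride + col] == 2.0 &&
         m[2 * stride + col] == 3.0);
  return 0;
}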
- dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), indices.Data(), + Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1996,10 +2174,11 @@ void CuMatrixBase::AddCols(const CuMatrixBase &src, KALDI_ASSERT(indices.Dim() == NumCols()); KALDI_ASSERT(NumRows() == src.NumRows()); Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), + Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -2008,18 +2187,17 @@ void CuMatrixBase::AddCols(const CuMatrixBase &src, Mat().AddCols(src.Mat(), indices.Data()); } } - + template void CuMatrixBase::CopyRows(const CuArray &src) { if (NumRows() == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(src.Dim()) == NumRows()); - Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), + n_blocks(num_rows_, CU2DBLOCK)); cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2037,11 +2215,11 @@ void CuMatrixBase::CopyToRows(const CuArray &dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(dst.Dim()) == NumRows()); - + Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), + n_blocks(num_rows_, CU2DBLOCK)); cuda_copy_to_rows(dimGrid, dimBlock, dst.Data(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2062,11 +2240,10 @@ void CuMatrixBase::AddRows(Real alpha, if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(indexes.Dim()) == NumRows()); KALDI_ASSERT(src.NumCols() == NumCols()); - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_rows(dimGrid, dimBlock, alpha, data_, src.Data(), indexes.Data(), Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); @@ -2085,11 +2262,10 @@ void CuMatrixBase::AddRows(Real alpha, const CuArray &src) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(src.Dim()) == NumRows()); - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_rows(dimGrid, 
dimBlock, alpha, data_, src.Data(), Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2108,11 +2284,10 @@ void CuMatrixBase::AddToRows(Real alpha, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(dst.Dim()) == NumRows()); - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_to_rows(dimGrid, dimBlock, alpha, dst.Data(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2132,12 +2307,12 @@ void CuMatrixBase::SumColumnRanges(const CuMatrixBase &src, if (NumRows() == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_sum_column_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(), src.Dim(), indices.Data()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_sum_column_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(), + src.Dim(), indices.Data()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -2165,16 +2340,15 @@ void CuMatrixBase::SumColumnRanges(const CuMatrixBase &src, template void CuMatrixBase::AddRowRanges(const CuMatrixBase &src, const CuArray &indexes) { - KALDI_ASSERT(static_cast(indexes.Dim()) == NumCols()); - KALDI_ASSERT(src.NumCols() >= NumCols()); + KALDI_ASSERT(static_cast(indexes.Dim()) == NumRows()); + KALDI_ASSERT(src.NumCols() == NumCols()); if (NumRows() == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_row_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(), src.Dim(), indexes.Data()); CU_SAFE_CALL(cudaGetLastError()); @@ -2188,9 +2362,9 @@ void CuMatrixBase::AddRowRanges(const CuMatrixBase &src, const Real *src_data = src.data_; const Int32Pair *indexes_data = indexes.Data(); for (int32 row = 0; row < num_rows; row++) { + int32 start_row = indexes_data[row].first, + end_row = indexes_data[row].second; for (int32 col = 0; col < num_cols; col++) { - int32 start_row = indexes_data[col].first, - end_row = indexes_data[col].second; Real sum = 0.0; for (int32 src_row = start_row; src_row < end_row; src_row++) sum += src_data[src_row * src_stride + col]; @@ -2209,7 +2383,7 @@ void CuMatrixBase::CopyLowerToUpper() { if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - int32 dim = this->num_rows_; + int32 dim = num_rows_; dim3 dimGrid(n_blocks(dim, CU2DBLOCK), n_blocks(dim, CU2DBLOCK)); cuda_copy_low_upp(dimGrid, dimBlock, data_, Dim()); @@ -2229,8 +2403,8 @@ void CuMatrixBase::CopyUpperToLower() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); int32 dim = this->num_rows_; + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(dim, CU2DBLOCK), n_blocks(dim, CU2DBLOCK)); cuda_copy_upp_low(dimGrid, 
dimBlock, data_, Dim()); @@ -2295,10 +2469,10 @@ Real CuMatrixBase::Trace(bool check_square) const { int dimBlock(CU1DBLOCK); int dimGrid = 1;// only 1 block here. we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); cuda_vec_sum(dimGrid, dimBlock, data_, tmp.Data(), dim, Stride() + 1); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile("CuVectorBase::Sum", tim.Elapsed()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile("CuVectorBase::Sum", tim.Elapsed()); return tmp(0); - } else + } else #endif { return Mat().Trace(check_square); @@ -2348,7 +2522,7 @@ void CuMatrixBase::SetRandn() { if (CuDevice::Instantiate().Enabled()) { CuRand tmp; tmp.RandGaussian(this); - } else + } else #endif { Mat().SetRandn(); @@ -2362,7 +2536,7 @@ void CuMatrixBase::SetRandUniform() { if (CuDevice::Instantiate().Enabled()) { CuRand tmp; tmp.RandUniform(this); - } else + } else #endif { Mat().SetRandUniform(); @@ -2403,26 +2577,17 @@ template void CuMatrix::Transpose() { if (this->num_rows_ == 0) return; -#if HAVE_CUDA == 1 - if (this->num_rows_ == this->num_cols_ && CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // (x,y) indices will be (row of *this, col of *this) - dim3 dimGrid(n_blocks(this->num_rows_, CU2DBLOCK), - n_blocks(this->num_cols_, CU2DBLOCK)); - cuda_transpose_matrix(dimGrid, dimBlock, this->data_, this->Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - CuMatrix tmp(*this, kTrans); - *this = tmp; - } + // Copy and swap for all cases. + // No need for a separate kernel for square-matrix in-place transpose. + // It has the same possible peak performance as copy transpose, + // if allocate/deallocate overhead can be ignored. + CuMatrix tmp(*this, kTrans); + this->Swap(&tmp); } // Version of AddMatMat where 2nd argument is of type CuBlockMatrix. +// Caution: template void CuMatrixBase::AddMatBlock( Real alpha, @@ -2451,19 +2616,21 @@ void CuMatrixBase::AddMatBlock( if (CuDevice::Instantiate().Enabled()) { Timer tim; MatrixDim this_dim = Dim(); - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // (x,y) indices will be (row of *this, block of B) dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(B_num_blocks, CU2DBLOCK)); + // caution: the use of x as the row-index is not good, but + // this code is not much used, so I'm not updating it. cuda_add_mat_blockmat(dimGrid, dimBlock, data_, this_dim, A.Data(), A_num_rows, A_num_cols, A_row_stride, A_col_stride, B.CuData(), B_num_blocks, alpha, beta, (transB == kTrans ? 1 : 0)); - - CU_SAFE_CALL(cudaGetLastError()); - + + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -2494,7 +2661,7 @@ void CuMatrixBase::AddMatBlock( } template -void CuMatrixBase::AddElements(Real alpha, +void CuMatrixBase::AddElements(Real alpha, const std::vector >& input) { // Checks the dimension. MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; @@ -2511,7 +2678,7 @@ void CuMatrixBase::AddElements(Real alpha, Timer tim; int dimBlock(CU1DBLOCK); - int dimGrid = 1;// only 1 block here.
we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); + int dimGrid(n_blocks(input.size(), CU1DBLOCK)); cuda_matrix_add_elements(dimGrid, dimBlock, this->data_, this->Dim(), alpha, (MatrixElement*)addr, input.size()); @@ -2527,47 +2694,97 @@ void CuMatrixBase::AddElements(Real alpha, } } +template +void CuMatrixBase::AddElements(Real alpha, const CuArray &indexes, + const Real *input) { + if (indexes.Dim() == 0) return; + KALDI_ASSERT(input != NULL); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + CuVector tmp_vec(indexes.Dim(), kUndefined); + CU_SAFE_CALL(cudaMemcpy(tmp_vec.Data(), input, indexes.Dim() * sizeof(Real), + cudaMemcpyHostToDevice)); + + int dimBlock(CU1DBLOCK); + int dimGrid = n_blocks(indexes.Dim(), CU1DBLOCK); + cuda_matrix_add_indexed_values(dimGrid, dimBlock, this->Dim(), alpha, + indexes.Data(), tmp_vec.Data(), indexes.Dim(), this->data_); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; + const Int32Pair *index = indexes.Data(); + for (int32 i = 0; i < indexes.Dim(); i++) { + KALDI_ASSERT(index[i].first < num_rows && index[i].first >= 0 && + index[i].second < num_cols && index[i].second >= 0); + (*this)(index[i].first, index[i].second) += alpha * input[i]; + } + } +} + template void CuMatrixBase::Lookup(const std::vector &indices, - std::vector *output) const { + Real *output) const { // Checks the dimension. MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; for (int32 i = 0; i < indices.size(); ++i) { KALDI_ASSERT(indices[i].first < num_rows && indices[i].first >= 0 && indices[i].second < num_cols && indices[i].second >= 0); } - - // Checks the pointer. + if (indices.size() == 0) return; KALDI_ASSERT(output != NULL); - // Resizes the output vector. 
- output->resize(indices.size()); - #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuArray cuda_indices(indices); - CuArray cuda_output(output->size()); + Lookup(cuda_indices, output); + } else +#endif + { + for (int32 i = 0; i < indices.size(); i++) { + output[i] = (*this)(indices[i].first, indices[i].second); + } + } +} + +template +void CuMatrixBase::Lookup(const CuArray &indices, + Real *output) const { + int32 num_elements = indices.Dim(); + if (num_elements == 0) return; + KALDI_ASSERT(output != NULL); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuArray cuda_output(num_elements); Timer tim; dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(n_blocks(indices.size(), CU1DBLOCK), 1); - + dim3 dimGrid(n_blocks(num_elements, CU1DBLOCK), 1); + cuda_matrix_lookup(dimGrid, dimBlock, this->data_, this->Dim(), - cuda_indices.Data(), indices.size(), cuda_output.Data()); + indices.Data(), num_elements, cuda_output.Data()); CU_SAFE_CALL(cudaGetLastError()); - cuda_output.CopyToVec(output); - + cuda_output.CopyToHost(output); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { - for (int32 i = 0; i < indices.size(); i++) { - (*output)[i] = (*this)(indices[i].first, indices[i].second); + MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; + const Int32Pair *index = indices.Data(); + for (int32 i = 0; i < num_elements; i++) { + KALDI_ASSERT(index[i].first < num_rows && index[i].first >= 0 && + index[i].second < num_cols && index[i].second >= 0); + output[i] = (*this)(index[i].first, index[i].second); } } } + template void CuMatrixBase::EqualElementMask(const CuMatrixBase &mat, CuMatrix *mask) const { // Check the inputs: @@ -2579,10 +2796,12 @@ void CuMatrixBase::EqualElementMask(const CuMatrixBase &mat, CuMatri #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_equal_element_mask(dimGrid, dimBlock, this->data_, mat.Data(), mask->Data(), this->Dim(), mat.Stride(), mask->Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_equal_element_mask(dimGrid, dimBlock, this->data_, mat.Data(), + mask->Data(), this->Dim(), mat.Stride(), + mask->Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2611,7 +2830,7 @@ std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat) { // instantiate the template template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); -template +template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); @@ -2621,7 +2840,7 @@ template class CuMatrix; template class CuMatrixBase; template class CuMatrixBase; - + diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 0fcb517994c..fec26424ef8 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -27,6 +27,7 @@ #define KALDI_CUDAMATRIX_CU_MATRIX_H_ #include +#include #include "cudamatrix/cu-matrixdim.h" #include "cudamatrix/cu-common.h" @@ -43,6 +44,17 @@ namespace kaldi { template Real TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, MatrixTransposeType trans = kNoTrans); + +/// Does multiple matrix multiplications, executing them in parallel using +/// cuBLAS's gemmBatched if we are using a GPU. 
Vectors A, B and C must have +/// the same length; for each i, this function executes the matrix operation +/// C[i] = alpha * A[i](^T)*B[i](^T) + beta * C[i]. +template +void AddMatMatBatched(const Real alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const Real beta); + /** * Matrix for CUDA computing. * Does the computation on the CUDA card when CUDA is compiled in and @@ -95,9 +107,9 @@ class CuMatrixBase { /// and src.NumRows() must equal this.NumRows() void AddCols(const CuMatrixBase &src, const CuArray &indices); - + /// Copies row r from row indexes[r] of src. - /// As a special case, if indexes[i] < 0, sets row i to zero + /// As a special case, if indexes[i] < 0, sets row i to zero /// "reorder".size() must equal this->NumRows(), and /// src.NumCols() must equal this.NumCols() void CopyRows(const CuMatrixBase &src, @@ -111,19 +123,18 @@ class CuMatrixBase { /// NULL then this.Row(r) will be set to zero. void CopyRows(const CuArray &src); - /// For each row r of this matrix, copies it to the array of floats at - /// the location given by dst[r], where dst[r] is assumed to be obtained from the RowData() - /// function of another CuMatrix, or from CuVector::Data() (i.e. it should point - /// to memory on the GPU if we're using a GPU, or on the CPU otherwise). - /// If dst[r] is NULL, does not copy anywhere. Requires that none of the - /// memory regions pointed to by the pointers in "dst" overlap (e.g. none of - /// the pointers should be the same). + /// For each row r of this matrix, copies it to the array of floats at the + /// location given by dst[r], where dst[r] is assumed to be obtained from the + /// RowData() function of another CuMatrix, or from CuVector::Data() (i.e. it + /// should point to memory on the GPU if we're using a GPU, or on the CPU + /// otherwise). If dst[r] is NULL, does not copy anywhere. Requires that + /// none of the memory regions pointed to by the pointers in "dst" overlap + /// (e.g. none of the pointers should be the same). void CopyToRows(const CuArray &dst) const; - /// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]). /// If indexes[r] < 0, does not add anything. - /// "reorder".size() must equal this->NumRows(), + /// "reorder".size() must equal this->NumRows(), /// all elements of "reorder" must be in [0, src.NumRows()-1], /// and src.NumCols() must equal this.NumCols() void AddRows(Real alpha, @@ -135,7 +146,7 @@ class CuMatrixBase { /// a vector of floats, of the same length as this.NumCols(). void AddRows(Real alpha, const CuArray &src); - + /// For each row r of this matrix, adds it (times alpha) to the array of /// floats at the location given by dst[r], where dst[r] is assumed to be @@ -145,7 +156,7 @@ class CuMatrixBase { /// for that row. Requires that none of the memory regions pointed to by the /// pointers in "dst" overlap (e.g. none of the pointers should be the same). void AddToRows(Real alpha, const CuArray &dst) const; - + /// For each row r of this and for each column c, sets (*this)(r, c) to the /// sum \sum_j src(r, j), where j ranges from indexes[c].first through @@ -156,12 +167,12 @@ class CuMatrixBase { /// For each row r of this and for each column c, do /// (*this)(r, c) += \sum_j src(j, c), - /// where j ranges from indexes[c].first through indexes[c].second - 1. - /// All indexes must be >= 0 and < src.NumRows(); to represent an empty range - /// just use the same index twice. 
+ /// where j ranges from indexes[r].first through indexes[r].second - 1. + /// In general indexes must be >= 0 and < src.NumRows(); but to represent an empty range + /// you may use the pair (-1, -1) or any pair of numbers (i, j) such that i >= j. void AddRowRanges(const CuMatrixBase &src, const CuArray &indexes); - + friend Real TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, @@ -171,10 +182,15 @@ class CuMatrixBase { const CuSparseMatrix &B, MatrixTransposeType trans); + friend void AddMatMatBatched(const Real alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const Real beta); + /// Adds "value" to the diagonal elements of the matrix. The matrix /// *this does not have to be square. void AddToDiag(Real value); - + /// Dimensions MatrixIndexT NumRows() const { return num_rows_; } MatrixIndexT NumCols() const { return num_cols_; } @@ -182,21 +198,21 @@ class CuMatrixBase { // MatrixDim is a struct containing "rows", "cols" and "stride", // that is an argument of most CUDA kernels. - ::MatrixDim Dim() const { - ::MatrixDim d = { num_rows_, num_cols_, stride_ }; - return d; + ::MatrixDim Dim() const { + ::MatrixDim d = { num_rows_, num_cols_, stride_ }; + return d; } Real FrobeniusNorm() const { return sqrt(TraceMatMat(*this, *this, kTrans)); } - bool IsUnit(Real tol = 0.001) const; + bool IsUnit(Real tol = 0.001) const; /// True if ((*this)-other).FrobeniusNorm() <= tol * this->FrobeniusNorm() bool ApproxEqual(const CuMatrixBase &other, float tol = 0.01) const; - + /// Get size of matrix in bytes MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); } - + // Copy functions. These do not resize. template void CopyFromMat(const MatrixBase &src, @@ -209,23 +225,28 @@ class CuMatrixBase { MatrixTransposeType trans = kNoTrans); void CopyFromSp(const CuSpMatrix &M); - + template void CopyFromTp(const CuTpMatrix &M, MatrixTransposeType trans = kNoTrans); - + template void CopyFromMat(const CuMatrixBase &M, - MatrixTransposeType trans = kNoTrans); + MatrixTransposeType trans = kNoTrans); template void CopyToMat(MatrixBase *dst, MatrixTransposeType trans = kNoTrans) const; - + + /// This function has two modes of operation. If v.Dim() == NumRows() * + /// NumCols(), then treats the vector as a row-by-row concatenation of a + /// matrix and copies to *this. + /// if v.Dim() == NumCols(), it sets each row of *this to a copy of v. void CopyRowsFromVec(const CuVectorBase &v); + /// Version of CopyRowsFromVec() that takes a CPU-based vector. void CopyRowsFromVec(const VectorBase &v); - + /// Copy vector into specific column of matrix. void CopyColFromVec(const CuVectorBase &v, const MatrixIndexT col); @@ -233,6 +254,11 @@ class CuMatrixBase { /// element by element, x = 1 / (1 + exp(-x)) void Sigmoid(const CuMatrixBase &src); + /// Set each element to the Heaviside function of the corresponding element + /// of "src", which we define as the function (x > 0 ? 1.0 : 0.0) [note: + /// in general, there are different ways to deal with the situation when x==0.] + void Heaviside(const CuMatrixBase &src); + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -268,7 +294,7 @@ class CuMatrixBase { /// defined (it's not defined where multiple inputs in the group are equal to the output). 
void GroupMaxDeriv(const CuMatrixBase &input, const CuMatrixBase &output); - + /// Compute the hyperbolic tangent (tanh) function; element by element, /// *this = tanh(src). void Tanh(const CuMatrixBase &src); @@ -282,9 +308,9 @@ class CuMatrixBase { /// tanh output. Does, element-by-element, *this = diff * (1 - value^2). void DiffTanh(const CuMatrixBase &value, const CuMatrixBase &diff); - + /// Differentiate the block [softmax+cross-entropy] : - /// dE/da = posterior_mat - target_mat, + /// dE/da = posterior_mat - target_mat, /// 'E' is error function, 'a' is activation on softmax input /// /// Interface: @@ -293,7 +319,7 @@ class CuMatrixBase { /// log_post_tgt ... per-frame statistics for cross-entropy computations : /// log(sum_row(posterior_mat .* target_mat)) void DiffXent(const CuArray &tgt, - CuVector *log_post_tgt); + CuVector *log_post_tgt); /// This function sets *this to the Cholesky factor of *this (i.e. the C /// satisfying *this = C C^T), and sets "inv_cholesky" (if supplied) to its @@ -305,17 +331,19 @@ class CuMatrixBase { /// Inversion for positive definite symmetric matrices. /// Treats the input as symmetric but only reads the lower triangle. /// The output is symmetric. - void SymInvertPosDef(); + void SymInvertPosDef(); void ApplyPow(Real power); - ///< Apply power to the absolute value of each element. - ///< If include_sign is true, the result will be multiplied with + ///< Apply power to the absolute value of each element. + ///< If include_sign is true, the result will be multiplied with ///< the sign of the input value. ///< If the power is negative and the input to the power is zero, ///< the output will be set to zero. If include_sign is true, it will ///< multiply the result by the sign of the input. void ApplyPowAbs(Real power, bool include_sign=false); - void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0) + /// For each element, sets x = (x > 0 ? 1.0 : 0.0). + /// See also Heaviside(). + void ApplyHeaviside(); void ApplyFloor(Real floor_val); void ApplyCeiling(Real ceiling_val); void ApplyExp(); @@ -331,16 +359,7 @@ class CuMatrixBase { /// Find the id of the maximal element for each row void FindRowMaxId(CuArray *id) const; - - /* - // Copy row interval from matrix - // @param r [in] number of rows to copy. - // @param src [in] source matrix. - // @param src_ro [in] source matrix row offset. - // @param dst_ro [in] destination matrix row offset. - // void CopyRowsFromMat(int32 r, const CuMatrixBase &src, int32 src_ro, int32 dst_ro); - */ - + /// Math operations, some calling kernels void SetZero(); void Set(Real value); @@ -349,18 +368,18 @@ class CuMatrixBase { void SetZeroAboveDiag(); void Scale(Real value); void ApplyLog(); - - /// Multiply two matrices elementwise: C = A .* C + + /// Multiply two matrices elementwise: C = C .* A void MulElements(const CuMatrixBase &A); - /// Divide two matrices elementwise: C = A ./ C + /// Divide two matrices elementwise: C = A ./ A void DivElements(const CuMatrixBase &A); /// Do, elementwise, *this = max(*this, A). void Max(const CuMatrixBase &A); /// scale i'th column by scale[i] - void MulColsVec(const CuVectorBase &scale); + void MulColsVec(const CuVectorBase &scale); /// scale i'th row by scale[i] void MulRowsVec(const CuVectorBase &scale); - /// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j]. + /// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j].
void MulRowsGroupMat(const CuMatrixBase &src); /// divide i'th row by scale[i] void DivRowsVec(const CuVectorBase &div); @@ -369,7 +388,11 @@ class CuMatrixBase { /// *this += alpha * A void AddMat(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); - + + /// if A.NumRows() is multiple of (*this)->NumRows and A.NumCols() is multiple of (*this)->NumCols + /// divide A into blocks of the same size as (*this) and add them to *this (times alpha) + void AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); + /// (for each column c of *this), c = alpha * col + beta * c void AddVecToCols(Real alpha, const CuVectorBase &col, Real beta = 1.0); /// (for each row r of *this), r = alpha * row + beta * r @@ -377,6 +400,8 @@ class CuMatrixBase { /// C = alpha * A(^T)*B(^T) + beta * C void AddMatMat(Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, const CuMatrixBase &B, MatrixTransposeType transB, Real beta); + /// A = alpha * x * y^T + A . + void AddVecVec(Real alpha, const CuVectorBase &x, const CuVectorBase &y); /// *this = a * b / c (by element; when c = 0, *this = a) void AddMatMatDivMat(const CuMatrixBase &A, const CuMatrixBase &B, const CuMatrixBase &C); @@ -386,30 +411,30 @@ class CuMatrixBase { void SymAddMat2(const Real alpha, const CuMatrixBase &M, MatrixTransposeType transA, Real beta); - + /// This function is like AddMatMat but for where the second argument is of /// type CuBlockMatrix (a block-diagonal matrix of blocks). void AddMatBlock(Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, const CuBlockMatrix &B, MatrixTransposeType transB, Real beta); - + /// *this = beta * *this + alpha * diag(v) * M [or M^T]. /// The same as adding M but scaling each row M_i by v(i). void AddDiagVecMat(const Real alpha, const CuVectorBase &v, - const CuMatrixBase &M, MatrixTransposeType transM, - Real beta = 1.0); + const CuMatrixBase &M, MatrixTransposeType transM, + Real beta = 1.0); // *this = beta * *this + alpha * M * diag(v) [or M^T]. // The same as adding M but scaling each column M_j by v(j). void AddMatDiagVec(const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, CuVectorBase &v, - Real beta = 1.0); + Real beta = 1.0); /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) void AddMatMatElements(const Real alpha, - const CuMatrixBase& A, - const CuMatrixBase& B, - const Real beta); + const CuMatrixBase& A, + const CuMatrixBase& B, + const Real beta); /// this <-- beta*this + alpha*A*B void AddMatSp(const Real alpha, @@ -419,7 +444,7 @@ class CuMatrixBase { CuMatrix M(B); return AddMatMat(alpha, A, transA, M, kNoTrans, beta); } - + /// this <-- beta*this + alpha*SpA*B void AddSpMat(const Real alpha, const CuSpMatrix &A, @@ -465,7 +490,7 @@ class CuMatrixBase { } inline CuSubMatrix ColRange(const MatrixIndexT col_offset, const MatrixIndexT num_cols) const { - return CuSubMatrix(*this, 0, num_rows_, col_offset, num_cols); + return CuSubMatrix(*this, 0, num_rows_, col_offset, num_cols); } inline const CuSubVector Row(MatrixIndexT i) const { @@ -487,7 +512,7 @@ class CuMatrixBase { static_cast(num_cols_)); return CuValue(data_ + r * stride_ + c); } - + inline Real operator() (MatrixIndexT r, MatrixIndexT c) const { KALDI_PARANOID_ASSERT(static_cast(r) < static_cast(num_rows_) && @@ -513,11 +538,23 @@ class CuMatrixBase { // (*this). 
void AddElements(Real alpha, const std::vector >& input); - // This function resizes the output to indexes.size(), and for each element of - // "indexes" it interprets it as a (row, column) index into *this, and puts - // (*this)(row, column) into the corresponding element of "output". + // For each i, with indexes[i] = (j, k), does (*this)(j, k) += input[i]. + // Requires, but does not check, that the vector of indexes does not contain + // repeated elements; 'input' is the start of an array of length equal to + // indexes.Dim(), which is located on GPU memory if we are using the GPU. + void AddElements(Real alpha, const CuArray &indexes, + const Real *input); + + // This function requires that 'output' is a host array and is allocated with size + // of indexes.size(), and for each element of 'indexes' it interprets it as + // a (row, column) index into *this, and puts (*this)(row, column) into + // the corresponding element of 'output'. void Lookup(const std::vector &indexes, - std::vector *output) const; + Real *output) const; + + // CUDA version of Lookup, would be called internally by the above function. + void Lookup(const CuArray &indexes, + Real *output) const; // Creates binary mask with per-element equality predicates of *this, mat. // Output stored to 'mask', values : 1.0 = equal, 0.0 = not-equal. @@ -547,14 +584,14 @@ class CuMatrixBase { inline MatrixBase &Mat() { return *(reinterpret_cast* >(this)); } - + protected: - + // The constructors are protected to prevent the user creating an instance of // this class (you should create a child class CuMatrix or CuSubMatrix). - + CuMatrixBase(): data_(NULL), num_cols_(0), num_rows_(0), stride_(0) { } - + /// This constructor takes the #rows, #cols and stride; it's called from /// the constructor of CuSubMatrix. CuMatrixBase(Real *data, @@ -574,6 +611,7 @@ class CuMatrixBase { MatrixIndexT num_cols_; MatrixIndexT num_rows_; MatrixIndexT stride_; + private: KALDI_DISALLOW_COPY_AND_ASSIGN(CuMatrixBase); }; // class CuMatrixBase @@ -585,11 +623,12 @@ class CuMatrix: public CuMatrixBase { public: CuMatrix() { } - + /// Constructor with memory initialisation CuMatrix(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type = kSetZero) { - Resize(rows, cols, resize_type); + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride) { + Resize(rows, cols, resize_type, stride_type); } // Note: we had to remove the "explicit" keyword due @@ -599,7 +638,7 @@ class CuMatrix: public CuMatrixBase { explicit CuMatrix(const CuBlockMatrix &other, MatrixTransposeType trans = kNoTrans); - + explicit CuMatrix(const CuMatrixBase &other, MatrixTransposeType trans = kNoTrans); @@ -607,7 +646,7 @@ class CuMatrix: public CuMatrixBase { explicit CuMatrix(const MatrixBase &other, MatrixTransposeType trans = kNoTrans); - /// Copy constructor taking SpMatrix... + /// Copy constructor taking SpMatrix...
explicit CuMatrix(const CuSpMatrix &M) : CuMatrixBase() { Resize(M.NumRows(), M.NumRows(), kUndefined); this->CopyFromSp(M); @@ -625,7 +664,7 @@ class CuMatrix: public CuMatrixBase { template explicit CuMatrix(const CuMatrixBase &M, MatrixTransposeType trans = kNoTrans); - + CuMatrix &operator = (const CuMatrixBase &other) { this->Resize(other.NumRows(), other.NumCols(), kUndefined); this->CopyFromMat(other); @@ -636,8 +675,8 @@ class CuMatrix: public CuMatrixBase { this->Resize(other.NumRows(), other.NumCols(), kUndefined); this->CopyFromMat(other); return *this; - } - + } + CuMatrix &operator = (const MatrixBase &other) { this->Resize(other.NumRows(), other.NumCols(), kUndefined); this->CopyFromMat(other); @@ -648,14 +687,15 @@ class CuMatrix: public CuMatrixBase { /// Allocate the memory void Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type = kSetZero); - + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride); + void Swap(Matrix *mat); void Swap(CuMatrix *mat); template void Swap(CuMatrix *mat); - + /// I/O functions void Read(std::istream &is, bool binary); @@ -698,7 +738,16 @@ class CuSubMatrix: public CuMatrixBase { const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols); - + + // This constructor should be used with caution; it can be used for + // constructing 'fake' submatrices if you want to play with + // the stride. 'data' should point to GPU data if you're using the + // GPU. + inline CuSubMatrix(const Real *data, + const MatrixIndexT num_rows, + const MatrixIndexT num_cols, + const MatrixIndexT stride); + /// This type of constructor is needed for Range() to work [in CuMatrix base /// class]. Cannot make it explicit or that breaks. inline CuSubMatrix (const CuSubMatrix &other): @@ -717,8 +766,8 @@ bool ApproxEqual(const CuMatrixBase &A, } template -inline void AssertEqual(CuMatrixBase &A, CuMatrixBase &B, - float tol = 0.01) { +inline void AssertEqual(const CuMatrixBase &A, + const CuMatrixBase &B, float tol = 0.01) { KALDI_ASSERT(A.ApproxEqual(B, tol)); } @@ -742,8 +791,8 @@ template template Matrix::Matrix(const CuMatrixBase &M, MatrixTransposeType trans) { - if (trans == kNoTrans) Init(M.NumRows(), M.NumCols()); - else Init(M.NumCols(), M.NumRows()); + if (trans == kNoTrans) Init(M.NumRows(), M.NumCols(), kDefaultStride); + else Init(M.NumCols(), M.NumRows(), kDefaultStride); M.CopyToMat(this, trans); } diff --git a/src/cudamatrix/cu-matrixdim.h b/src/cudamatrix/cu-matrixdim.h index 32df913b4b1..dab7bd40eb2 100644 --- a/src/cudamatrix/cu-matrixdim.h +++ b/src/cudamatrix/cu-matrixdim.h @@ -57,7 +57,7 @@ extern "C" { // we define the following constants here because this file is included // both by the C++ code and also CUDA code. - + // The size of a CUDA 1-d block, e.g. for vector operations.. 
#define CU1DBLOCK 256 diff --git a/src/cudamatrix/cu-packed-matrix-test.cc b/src/cudamatrix/cu-packed-matrix-test.cc index 8fb8b2d1182..381ced3b2c2 100644 --- a/src/cudamatrix/cu-packed-matrix-test.cc +++ b/src/cudamatrix/cu-packed-matrix-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cu-sp-matrix-test.cc +// cudamatrix/cu-packed-matrix-test.cc // // Copyright 2013 Ehsan Variani // Lucas Ondel diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 017b719a749..fd69e652be0 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -22,7 +22,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include "base/timer.h" @@ -297,8 +297,8 @@ void CuPackedMatrix::Scale(Real alpha) { Timer tim; size_t nr = static_cast(num_rows_), num_elements = ((nr * (nr+1)) / 2); - cublas_scal(num_elements, alpha, data_, 1); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cublas_scal(GetCublasHandle(), num_elements, alpha, data_, 1)); + CuDevice::Instantiate().AccuProfile("CuPackedMatrix::Scale", tim.Elapsed()); } else #endif @@ -333,7 +333,7 @@ void CuPackedMatrix::AddPacked(const Real alpha, const CuPackedMatrix &M); - + private: // Disallow assignment. PackedMatrix & operator=(const PackedMatrix &other); diff --git a/src/cudamatrix/cu-sp-matrix-speed-test.cc b/src/cudamatrix/cu-sp-matrix-speed-test.cc index 9ad0f6d23db..455bf58608f 100644 --- a/src/cudamatrix/cu-sp-matrix-speed-test.cc +++ b/src/cudamatrix/cu-sp-matrix-speed-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cu-matrix-speed-test.cc +// cudamatrix/cu-sp-matrix-speed-test.cc // Copyright 2013 Johns Hopkins University (author: Daniel Povey) @@ -53,7 +53,7 @@ static void UnitTestCuSpMatrixInvert(int32 dim) { if (iter > 0) { B.Invert(); } else { // do some more testing... - + CuMatrix D(A); A.AddMat2(1.0, D, kTrans, 1.0); A.AddToDiag(0.1 * dim); @@ -61,10 +61,10 @@ static void UnitTestCuSpMatrixInvert(int32 dim) { CuMatrix C(B); B.AddMat2(1.0, C, kTrans, 1.0); B.AddToDiag(0.1 * dim); - + A.Invert(); B.Invert(); - + SpMatrix E(dim); B.CopyToSp(&E); @@ -82,7 +82,7 @@ static void UnitTestCuSpMatrixInvert(int32 dim) { template static void UnitTestCuSpMatrixCopyFromMat(int32 dim, SpCopyType copy_type) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.01; int32 iter = 0; Timer tim; CuMatrix A(dim, dim); diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index 128d056bad2..2ad5834b796 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -20,7 +20,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include "base/timer.h" @@ -115,9 +115,9 @@ void CuSpMatrix::AddVec2(const Real alpha, const CuVectorBase &v) { dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(nr, CU2DBLOCK), n_blocks(nr, CU2DBLOCK)); - cublas_spr('U', this->num_rows_, alpha, v.Data(), - 1, this->Data()); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cublas_spr(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, this->num_rows_, alpha, v.Data(), + 1, this->Data())); + CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim.Elapsed()); } else #endif @@ -145,10 +145,10 @@ void CuSpMatrix::AddMat2(const Real alpha, const CuMatrixBase &M, return; } - char trans = (transM == kTrans ? 'N' : 'T'); + cublasOperation_t trans = (transM == kTrans ? 
CUBLAS_OP_N : CUBLAS_OP_T); CuMatrix tmp_mat(*this); - cublas_syrk('U', trans, this_dim, m_other_dim, alpha, M.Data(), + cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, trans, this_dim, m_other_dim, alpha, M.Data(), M.Stride(), beta, tmp_mat.Data(), tmp_mat.Stride()); this->CopyFromMat(tmp_mat, kTakeLower); @@ -218,7 +218,6 @@ bool CuSpMatrix::IsUnit(Real tol) const { // Note: we could do this more efficiently still, by slightly changing the // definition of IsUnit and getting rid of the extra stuff inside TraceSpSp // that corrects for the diagonal being counted twice. - return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <= tol * this->NumRows()); } diff --git a/src/cudamatrix/cu-sp-matrix.h b/src/cudamatrix/cu-sp-matrix.h index 2fa46c332f6..2b66c208149 100644 --- a/src/cudamatrix/cu-sp-matrix.h +++ b/src/cudamatrix/cu-sp-matrix.h @@ -135,7 +135,6 @@ class CuSpMatrix : public CuPackedMatrix { inline SpMatrix &Mat() { return *(reinterpret_cast* >(this)); } - }; template diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index a94f4685928..726b6e5ccd8 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -77,7 +77,7 @@ static void UnitTestCuSparseMatrixSum() { Real sum1 = cu_smat.Sum(); Real sum2 = mat.Sum(); - AssertEqual(sum1, sum2, 0.00001); + KALDI_ASSERT(fabs(sum1 - sum2) < 1.0e-05); } } diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index e4808615728..17d69ce849a 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include @@ -55,9 +55,12 @@ MatrixIndexT CuSparseMatrix::NumElements() const { template Real CuSparseMatrix::Sum() const { + if (NumElements() == 0) + return 0.0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CuVector sum_vec(*this); + CuVector sum_vec(this->NumElements(), kUndefined); + this->CopyElementsToVec(&sum_vec); return sum_vec.Sum(); } else #endif @@ -70,7 +73,8 @@ template Real CuSparseMatrix::FrobeniusNorm() const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CuVector element_vec(*this); + CuVector element_vec(this->NumElements(), kUndefined); + this->CopyElementsToVec(&element_vec); return element_vec.Norm(2); } else #endif @@ -202,6 +206,27 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const; template void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const; +template +void CuSparseMatrix::CopyElementsToVec(CuVectorBase *vec) const { + KALDI_ASSERT(vec != NULL); + KALDI_ASSERT(this->NumElements() == vec->Dim()); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + cublas_copy(GetCublasHandle(), + this->NumElements(), + &(this->elements_.Data()->weight), + static_cast(sizeof(MatrixElement) / sizeof(Real)), + vec->Data(), 1); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + Vector tmp(this->NumElements(), kUndefined); + Mat().CopyElementsToVec(&tmp); + vec->CopyFromVec(tmp); + } +} template void CuSparseMatrix::Swap(SparseMatrix *smat) { @@ -341,6 +366,7 @@ void GeneralMatrix::CopyToMat(CuMatrixBase *cu_mat, Matrix mat(cmat_); if (trans == kNoTrans) { cu_mat->CopyFromMat(mat); + break; } else { CuMatrix temp_cu; temp_cu.Swap(&mat); diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h index ff2ba238414..1298ee5ea5f 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ 
b/src/cudamatrix/cu-sparse-matrix.h @@ -95,6 +95,10 @@ class CuSparseMatrix { template void CopyToSmat(SparseMatrix *smat) const; + /// Copy elements to CuVector. It is the caller's responsibility to resize + /// <*vec>. + void CopyElementsToVec(CuVectorBase *vec) const; + /// Swap with CPU-based matrix. void Swap(SparseMatrix *smat); diff --git a/src/cudamatrix/cu-tp-matrix-test.cc b/src/cudamatrix/cu-tp-matrix-test.cc index 87203ea3a65..e9d1d66aad9 100644 --- a/src/cudamatrix/cu-tp-matrix-test.cc +++ b/src/cudamatrix/cu-tp-matrix-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cu-sp-matrix-test.cc +// cudamatrix/cu-tp-matrix-test.cc // // Copyright 2013 Ehsan Variani // Lucas Ondel diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index efc12df2bfb..4c3d32d2468 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -20,7 +20,7 @@ #if HAVE_CUDA==1 #include -#include +#include #endif #include "base/timer.h" @@ -74,9 +74,8 @@ void CuTpMatrix::Invert() { CU_SAFE_CALL(cudaGetLastError()); CuMatrix tmp2(dim, dim); tmp2.CopyFromTp(*this); - cublas_trsm(dim, dim, alpha, tmp2.Data(), tmp2.Dim().stride, - tmp.Data(), tmp.Dim().stride); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cublas_trsm(GetCublasHandle(), dim, dim, alpha, tmp2.Data(), tmp2.Dim().stride, + tmp.Data(), tmp.Dim().stride)); this->CopyFromMat(tmp, kNoTrans); } else #endif diff --git a/src/cudamatrix/cu-tp-matrix.h b/src/cudamatrix/cu-tp-matrix.h index 1b74dd98470..8de46ec46f5 100644 --- a/src/cudamatrix/cu-tp-matrix.h +++ b/src/cudamatrix/cu-tp-matrix.h @@ -83,7 +83,6 @@ class CuTpMatrix : public CuPackedMatrix { inline TpMatrix &Mat() { return *(reinterpret_cast* >(this)); } - }; } // namespace diff --git a/src/cudamatrix/cu-vector-speed-test.cc b/src/cudamatrix/cu-vector-speed-test.cc index b42a04204f2..1e3b46a4ac7 100644 --- a/src/cudamatrix/cu-vector-speed-test.cc +++ b/src/cudamatrix/cu-vector-speed-test.cc @@ -39,7 +39,7 @@ std::string NameOf() { } template void TestCuVectorSoftmax(int32 dim) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector M(dim); M.SetRandn(); @@ -57,7 +57,7 @@ template void TestCuVectorSoftmax(int32 dim) { template void TestCuVectorSum(int32 dim) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector M(dim); M.SetRandn(); @@ -75,7 +75,7 @@ template void TestCuVectorSum(int32 dim) { template void TestCuVectorVecVecOne(int32 dim) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector M(dim); M.SetRandn(); @@ -99,7 +99,7 @@ template void TestCuVectorVecVecOne(int32 dim) { template void TestCuVectorAddDiagMatMat(int32 dim, MatrixTransposeType transN, MatrixTransposeType transO) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector v(dim); v.SetRandn(); CuMatrix N(dim, dim), O(dim, dim); @@ -108,7 +108,7 @@ template void TestCuVectorAddDiagMatMat(int32 dim, Timer tim; int32 iter = 0; - + for (;tim.Elapsed() < time_in_secs; iter++) { v.AddDiagMatMat(1.0, N, transN, O, transO, 1.0); } @@ -123,7 +123,7 @@ template void TestCuVectorAddDiagMatMat(int32 dim, template void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeType trans) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector v(dim); v.SetRandn(); CuMatrix N(dim, dim); @@ -131,7 +131,7 @@ template void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeT Timer tim; int32 iter = 0; - + for (;tim.Elapsed() < time_in_secs; iter++) { v.AddDiagMat2(1.0, N, trans, 0.0); } @@ -209,7 +209,7 @@ 
template void CudaVectorSpeedTest() { TestCuVectorAddDiagMatMat(sizes[s], kTrans, kNoTrans); TestCuVectorAddDiagMatMat(sizes[s], kTrans, kTrans); } - for (int32 s = 0; s < ns; s++) { + for (int32 s = 0; s < ns; s++) { TestCuVectorAddDiagMat2(sizes[s], kNoTrans); TestCuVectorAddDiagMat2(sizes[s], kTrans); } @@ -221,7 +221,7 @@ template void CudaVectorSpeedTest() { TestCuVectorAddColSumMat(sizes[s], kNoTrans); TestCuVectorAddColSumMat(sizes[s], kTrans); } - + } diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index db715d75d7a..9b7aa97776a 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cuda-vector-test.cc +// cudamatrix/cu-vector-test.cc // Copyright 2013 Lucas Ondel // 2013 Johns Hopkins University (author: Daniel Povey) @@ -22,7 +22,7 @@ #include #include #include - +#include #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudamatrix/cu-matrix.h" @@ -62,7 +62,7 @@ static void UnitTestCuVectorIO() { } -template +template static void UnitTestCuVectorCopyFromVec() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -80,7 +80,7 @@ static void UnitTestCuVectorCopyFromVec() { } } -template +template static void UnitTestCuSubVector() { for (int32 iter = 0 ; iter < 10; iter++) { int32 M1 = 1 + rand () % 10, M2 = 1 + Rand() % 1, M3 = 1 + Rand() % 10, M = M1 + M2 + M3, @@ -97,7 +97,7 @@ static void UnitTestCuSubVector() { -template +template static void UnitTestCuVectorMulTp() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -105,7 +105,7 @@ static void UnitTestCuVectorMulTp() { A.SetRandn(); TpMatrix B(dim); B.SetRandn(); - + CuVector C(A); CuTpMatrix D(B); @@ -127,10 +127,10 @@ static void UnitTestCuVectorAddTp() { B.SetRandn(); Vector C(dim); C.SetRandn(); - + CuVector D(A); CuTpMatrix E(B); - CuVector F(C); + CuVector F(C); A.AddTpVec(1.0, B, kNoTrans, C, 1.0); D.AddTpVec(1.0, E, kNoTrans, F, 1.0); @@ -160,7 +160,7 @@ template void CuVectorUnitTestAddVec() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -177,7 +177,7 @@ template void CuVectorUnitTestAddVecCross() { CuVector vec1_orig(vec1); Real alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } else { @@ -198,7 +198,7 @@ template void CuVectorUnitTestAddVecExtra() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243, beta = 1.4321; vec1.AddVec(alpha, vec2, beta); - + for (int32 i = 0; i < M; i++) AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -268,6 +268,20 @@ template static void UnitTestCuVectorReplaceValue() { } } +template static void UnitTestCuVectorSum() { + for (int32 i = 0; i < 200; i++) { + int32 start_dim = RandInt(1, 500), end_dim = RandInt(1, 500); + int32 dim = RandInt(10, 12000) + start_dim + end_dim; + Real quiet_nan = nan(""); // this is from . 
+ Vector vec(start_dim + dim + end_dim); + vec.Range(0, start_dim).Set(quiet_nan); + vec.Range(start_dim, dim).Set(1.0); + vec.Range(start_dim + dim, end_dim).Set(quiet_nan); + BaseFloat sum = vec.Range(start_dim, dim).Sum(); + KALDI_ASSERT(ApproxEqual(sum, dim)); + } +} + template void CuVectorUnitTestInvertElements() { // Also tests MulElements(); int32 M = 256 + Rand() % 100; @@ -288,7 +302,7 @@ template void CuVectorUnitTestSum() { CuVector A(dim), ones(dim); A.SetRandn(); ones.Set(1.0); - + AssertEqual(VecVec(A, ones), A.Sum()); } } @@ -320,7 +334,7 @@ template void CuVectorUnitTestCopyFromMat() { } Matrix matrix(cu_matrix), matrix2(M, N); CuMatrix matrix3(M, N); - + CuVector vector(M * N), vector2(M * N); vector.CopyRowsFromMat(cu_matrix); vector2.CopyRowsFromMat(matrix); @@ -328,8 +342,8 @@ template void CuVectorUnitTestCopyFromMat() { matrix3.CopyRowsFromVec(Vector(vector2)); Vector vector3(M * N); vector3.CopyRowsFromMat(cu_matrix); - - + + for(int32 j = 0; j < M*N; j++) { if (Rand() % 500 == 0) { // random small subset (it was slow) KALDI_ASSERT(vector(j) == cu_matrix(j/N, j%N)); @@ -412,7 +426,7 @@ template void CuVectorUnitTestNorm() { KALDI_ASSERT(ApproxEqual(cu_vector.Norm(1.0), 3.0)); KALDI_ASSERT(ApproxEqual(cu_vector.Norm(2.0), sqrt(5.0))); } - + template void CuVectorUnitTestMin() { for (int32 p = 0; p < 5; p++) { @@ -496,7 +510,7 @@ template void CuVectorUnitTestApplyFloor() { BaseFloat floor = 0.33 * (-5 + Rand() % 10); int32 i = cu_vector.ApplyFloor(floor); int32 j = vector.ApplyFloor(floor); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -507,6 +521,27 @@ template void CuVectorUnitTestApplyFloor() { } } +template void CuVectorUnitTestApplyCeiling() { + for (int32 l = 0; l < 10; l++) { + int32 dim = 100 + Rand() % 700; + CuVector cu_vector(dim); + cu_vector.SetRandn(); + + Vector vector(cu_vector); + BaseFloat floor = 0.33 * (-5 + Rand() % 10); + int32 i = cu_vector.ApplyCeiling(floor); + int32 j = vector.ApplyCeiling(floor); + + CuVector cu2(vector); + + AssertEqual(cu2, cu_vector); + if (i != j) { + KALDI_WARN << "ApplyCeiling return code broken..."; + } + KALDI_ASSERT(i==j); + } +} + template void CuVectorUnitTestApplyPow() { for (int32 l = 0; l < 10; l++) { int32 dim = 100 + Rand() % 700; @@ -519,7 +554,7 @@ template void CuVectorUnitTestApplyPow() { BaseFloat pow = -2 + (Rand() % 5); cu_vector.ApplyPow(pow); vector.ApplyPow(pow); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -558,7 +593,7 @@ template void CuVectorUnitTestAddDiagMat2() { cu_mat_orig.SetRandn(); MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans); CuMatrix cu_mat(cu_mat_orig, trans); - + Vector vector(cu_vector); Matrix mat(cu_mat); @@ -583,12 +618,12 @@ static void CuVectorUnitTestAddDiagMatMat() { MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans); MatrixTransposeType transN = ((iter/2) % 2 == 0 ? 
kNoTrans : kTrans); CuMatrix M(M_orig, transM), N(N_orig, transN); - + v.SetRandn(); CuVector w(v); w.AddDiagMatMat(alpha, M, transM, N, transN, beta); - + { CuVector w2(v); CuMatrix MN(dimM, dimM); @@ -648,7 +683,7 @@ template void CuVectorUnitTestAddSpVec() { CuSpMatrix mat_cu(M); mat_cu.SetRandn(); SpMatrix mat(mat_cu); - + BaseFloat alpha = 0.5 * (Rand() % 5), beta = 0.5 * (Rand() % 5); dst_cu.AddSpVec(alpha, mat_cu, src_cu, beta); dst.AddSpVec(alpha, mat, src, beta); @@ -674,6 +709,7 @@ template void CuVectorUnitTest() { CuVectorUnitTestScale(); CuVectorUnitTestSum(); CuVectorUnitTestInvertElements(); + UnitTestCuVectorSum(); CuVectorUnitTestAddRowSumMat(); CuVectorUnitTestAddColSumMat(); UnitTestCuVectorReplaceValue(); @@ -687,11 +723,12 @@ template void CuVectorUnitTest() { CuVectorUnitTestCopyDiagFromPacked(); CuVectorUnitTestCopyDiagFromMat(); CuVectorUnitTestCopyCross(); - CuVectorUnitTestCopyCross2(); - CuVectorUnitTestNorm(); + CuVectorUnitTestCopyCross2(); + CuVectorUnitTestNorm(); CuVectorUnitTestApplyExp(); CuVectorUnitTestApplyLog(); CuVectorUnitTestApplyFloor(); + CuVectorUnitTestApplyCeiling(); CuVectorUnitTestApplyPow(); CuVectorUnitTestAddMatVec(); CuVectorUnitTestAddSpVec(); @@ -710,10 +747,10 @@ int main(int argc, char *argv[]) { const char *usage = "Usage: cu-vector-test [options]"; ParseOptions po(usage); - std::string use_gpu = "yes"; + std::string use_gpu = "yes"; po.Register("use-gpu", &use_gpu, "yes|no|optional"); po.Read(argc, argv); - + if (po.NumArgs() != 0) { po.PrintUsage(); exit(1); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 16b554cab9a..6deb3809d85 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -20,7 +20,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include "base/timer.h" @@ -48,12 +48,10 @@ Real VecVec(const CuVectorBase &a, KALDI_ASSERT(a.Dim() == b.Dim()); Real result = 0; #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - result = cublas_dot(a.Dim(), a.Data(), 1, b.Data(), 1); - - CU_SAFE_CALL(cublasGetError()); + CU_SAFE_CALL(cublas_dot(GetCublasHandle(), a.Dim(), a.Data(), 1, b.Data(), + 1, &result)); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -85,11 +83,9 @@ void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, MatrixInd #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - int dimBlock(CU1DBLOCK); - int dimGrid(n_blocks(dim_,CU1DBLOCK)); - - cuda_copy_col_from_mat(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + cublas_copy(GetCublasHandle(), + this->dim_, mat.Data() + col, mat.Stride(), this->data_, 1); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim.Elapsed()); } else #endif @@ -110,7 +106,7 @@ void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, Matrix int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_copy_col_from_mat_df(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim.Elapsed()); } else #endif @@ -132,8 +128,8 @@ void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, Matrix int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_copy_col_from_mat_fd(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_); - CU_SAFE_CALL(cudaGetLastError()); - 
CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim.Elapsed()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim.Elapsed()); } else #endif { @@ -143,7 +139,7 @@ void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, Matrix template void CuVectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { - KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); + KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; @@ -177,9 +173,9 @@ Real CuVectorBase::Norm(Real p) { KALDI_ASSERT(p == 1.0 || p == 2.0); if (dim_ == 0) return 0.0; if (p == 1.0) { - ans = cublas_asum(dim_, data_, 1); + cublas_asum(GetCublasHandle(), dim_, data_, 1, &ans); } else { - ans = cublas_nrm2(dim_, data_, 1); + cublas_nrm2(GetCublasHandle(), dim_, data_, 1, &ans); } CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); if (ans != ans) { @@ -248,7 +244,7 @@ void MatrixBase::CopyRowsFromVec(const CuVectorBase &v) { CopyRowsFromVec(v.Vec()); } } - + // instantiate the template above. template void MatrixBase::CopyRowsFromVec(const CuVectorBase &v); template void MatrixBase::CopyRowsFromVec(const CuVectorBase &v); @@ -280,10 +276,9 @@ Real CuVectorBase::Sum() const { CU_SAFE_CALL(cudaGetLastError()); Vector tmp(dimGrid); g.CopyToVec(&tmp); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); return tmp.Sum(); } else { - if (dim_ == 0) return 0.0; CuVector tmp(1, kUndefined); int dimBlock(CU1DBLOCK); int dimGrid = 1; // only 1 block here. we have loops in each thread. @@ -306,7 +301,7 @@ void CuVectorBase::ApplySoftMax() { if (dim_ == 0) return; Timer tim; size_t dimBlock = dim_ > CU1DBLOCK ? CU1DBLOCK : dim_; // for cuda_softmax_reduce function, dimBlock value is fixed min(CU1DBLOCK, dim) , represent CU1DBLOCK threads reduce a row at the same time. - size_t dimGrid = 1; // dimGrid value represent the number of rows + size_t dimGrid = 1; // dimGrid value represent the number of rows ::MatrixDim dim = { 1, this->dim_, this->dim_}; cuda_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_);//actually dim is not stride... 
CU_SAFE_CALL(cudaGetLastError()); @@ -329,9 +324,9 @@ MatrixIndexT CuVectorBase::ApplyFloor(Real floor_val) { int dimGrid(n_blocks(dim_,CU1DBLOCK)); CuVector count_vec(dim_, kUndefined); - + cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); num_floored = count_vec.Sum(); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim.Elapsed()); } else @@ -344,22 +339,27 @@ MatrixIndexT CuVectorBase::ApplyFloor(Real floor_val) { } template -void CuVectorBase::ApplyCeiling(Real ceiling_val) { +MatrixIndexT CuVectorBase::ApplyCeiling(Real ceiling_val) { + MatrixIndexT num_ceiled = 0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return; + if (dim_ == 0) return 0; Timer tim; - dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK), 1); - MatrixDim pseudo_matrix_dim = { 1, Dim(), Dim() }; // vector is a matix with 1 row, - cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, pseudo_matrix_dim); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeiling", tim.Elapsed()); + int dimBlock(CU1DBLOCK); + int dimGrid(n_blocks(dim_,CU1DBLOCK)); + + CuVector count_vec(dim_, kUndefined); + + cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + num_ceiled = count_vec.Sum(); + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim.Elapsed()); } else #endif { - Vec().ApplyCeiling(ceiling_val); + num_ceiled = Vec().ApplyCeiling(ceiling_val); } + return num_ceiled; } template @@ -370,12 +370,12 @@ void CuVectorBase::ApplyPow(Real power) { Timer tim; // for this particular kernel, x is #rows, y is #cols. so // fake matrix with 1 row, Dim() cols. - dim3 dimBlock(1, CU1DBLOCK); - dim3 dimGrid(1, n_blocks(Dim(), CU1DBLOCK)); + dim3 dimBlock(CU1DBLOCK, 1); + dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK), 1); ::MatrixDim fake_matrix_dim = { 1, Dim(), 1 }; // num_cols is Dim(), num_rows is 1, stride is 1 (it's a don't-care). cuda_apply_pow(dimGrid, dimBlock, data_, power, fake_matrix_dim); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim.Elapsed()); } else #endif @@ -395,7 +395,7 @@ void CuVectorBase::ApplyExp() { int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_vec_apply_exp(dimGrid, dimBlock, data_, dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyExp", tim.Elapsed()); } else #endif @@ -416,7 +416,7 @@ void CuVectorBase::ApplyLog() { CuVector flag(1); cuda_vec_apply_log(dimGrid, dimBlock, data_, flag.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); if (flag(0) > 0) KALDI_ERR << "Trying to take log of a negative number."; CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyLog", tim.Elapsed()); @@ -439,15 +439,16 @@ void CuVectorBase::AddMatVec(const Real alpha, KALDI_ASSERT(&v != this); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return; + if (dim_ == 0) return; Timer tim; // Everything is backwards in CuBlas. We need to reverse rows, columns, // transpose-ness. - cublas_gemv((trans==kTrans?'N':'T'), M.NumCols(), M.NumRows(), alpha, - M.Data(), M.Stride(), v.Data(), 1, beta, data_, 1); + CU_SAFE_CALL(cublas_gemv(GetCublasHandle(), + (trans==kTrans? 
CUBLAS_OP_N:CUBLAS_OP_T), + M.NumCols(), M.NumRows(), alpha, M.Data(), + M.Stride(), v.Data(), 1, beta, data_, 1)); - CU_SAFE_CALL(cublasGetError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -470,9 +471,9 @@ void CuVectorBase::AddSpVec(const Real alpha, // Note: in our opinion the CuSpMatrix represents a lower-triangular matrix, but // in CUBLAS, for some stupid reason, everything is reversed. - cublas_spmv('U', Dim(), alpha, M.Data(), v.Data(), 1, beta, data_, 1); + CU_SAFE_CALL(cublas_spmv(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, Dim(), + alpha, M.Data(), v.Data(), 1, beta, data_, 1)); - CU_SAFE_CALL(cublasGetError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -494,7 +495,7 @@ void CuVectorBase::AddVecVec(Real alpha, const CuVectorBase &v, int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_add_vec_vec(dimGrid, dimBlock, alpha, data_, v.Data(), r.Data(), beta, dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::AddVecVec", tim.Elapsed()); } else #endif @@ -529,7 +530,7 @@ void CuVectorBase::AddDiagMat2(Real alpha, const CuMatrixBase &M, #endif { Vec().AddDiagMat2(alpha, M.Mat(), trans, beta); - } + } } template @@ -562,19 +563,19 @@ void CuVectorBase::AddDiagMatMat( int dimGridLimit = (transM == kNoTrans && transN == kTrans ? 2048 : (transM == kTrans && transN == kNoTrans ? 16 : 32)); - + while (M_col_dim > 10 * threads_per_element && dimGrid < dimGridLimit && threads_per_element < 256) { threads_per_element *= 2; dimGrid = n_blocks(dim * threads_per_element, CU1DBLOCK); } - + cuda_add_diag_mat_mat(dimGrid, dimBlock, alpha, data_, dim, M.Data(), M_col_dim, M_row_stride, M_col_stride, N.Data(), N_row_stride, N_col_stride, threads_per_element, beta); CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -590,7 +591,7 @@ void CuVectorBase::AddTpVec(const Real alpha, const CuTpMatrix &M, KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return; + if (dim_ == 0) return; Timer tim; if (beta == 0.0) { if (&v != this) CopyFromVec(v); @@ -601,7 +602,7 @@ void CuVectorBase::AddTpVec(const Real alpha, const CuTpMatrix &M, tmp.MulTp(M, trans); if (beta != 1.0) Scale(beta); // *this <-- beta * *this AddVec(alpha, tmp, 1.0); // *this += alpha * M * v - } + } } else #endif { @@ -617,8 +618,9 @@ void CuVectorBase::MulTp(const CuTpMatrix &M, const MatrixTransposeT if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; Timer tim; - cublas_tpmv((trans==kTrans?'N':'T'), M.NumRows(), M.Data(), data_, 1); - CuDevice::Instantiate().AccuProfile("CuVectorBase::MulTp", tim.Elapsed()); + cublas_tpmv(GetCublasHandle(), (trans==kTrans? CUBLAS_OP_N:CUBLAS_OP_T), + M.NumRows(), M.Data(), data_, 1); + CuDevice::Instantiate().AccuProfile("CuVectorBase::MulTp", tim.Elapsed()); } else #endif { @@ -655,11 +657,11 @@ Real CuVectorBase::Max() const { if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) { // max of an empty set is -infinity. 
return -std::numeric_limits::infinity(); - } + } Timer tim; CuVector ans(1); cuda_vec_max(data_, ans.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); result = ans(0); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -670,9 +672,9 @@ Real CuVectorBase::Max() const { return result; } -template +template void CuVectorBase::ReplaceValue(Real orig, Real changed) { -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; Timer tim; @@ -698,7 +700,7 @@ void CuVectorBase::MulElements(const CuVectorBase &v) { int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_, CU1DBLOCK)); cuda_vec_mul_elements(dimGrid, dimBlock, data_, v.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::MulElements", tim.Elapsed()); } else #endif @@ -719,7 +721,7 @@ void CuVectorBase::CopyFromVec(const CuVectorBase &src) { int dimGrid(n_blocks(dim_, CU2DBLOCK)); cuda_copy_from_vec_df(dimGrid, dimBlock, data_, src.data_, dim_); CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -752,14 +754,14 @@ template template void CuVectorBase::CopyFromVec(const VectorBase &src) { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { if (sizeof(Real) != sizeof(OtherReal)) { CuVector temp(dim_, kUndefined); temp.CopyFromVec(src); this->CopyFromVec(temp); } else { KALDI_ASSERT(src.Dim() == dim_); - if (dim_ == 0) return; + if (dim_ == 0) return; Timer tim; CU_SAFE_CALL(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyHostToDevice)); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D",tim.Elapsed()); @@ -780,34 +782,6 @@ void CuVectorBase::CopyFromVec(const VectorBase &src); template void CuVectorBase::CopyFromVec(const VectorBase &src); -template -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat) { - KALDI_ASSERT(dim_ == smat.NumElements()); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(n_blocks(smat.NumElements(), CU1DBLOCK), 1); - cuda_copy_from_smat_as_vec(dimGrid, dimBlock, this->data_, - smat.Data(), smat.NumElements()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Vector tmp(smat.Mat()); - this->CopyFromVec(tmp); - } -} -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat); -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat); -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat); -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat); - template template void CuVectorBase::CopyToVec(VectorBase *dst) const { @@ -879,18 +853,18 @@ void CuVector::Resize(MatrixIndexT dim, MatrixResizeType t) { this->data_ = static_cast(CuDevice::Instantiate().Malloc(dim * sizeof(Real))); this->dim_ = dim; if (t == kSetZero) this->SetZero(); - CuDevice::Instantiate().AccuProfile("CuVector::Resize", tim.Elapsed()); + CuDevice::Instantiate().AccuProfile("CuVector::Resize", tim.Elapsed()); } else #endif { Vector vec(dim); - this->Swap(&vec); + this->Swap(&vec); } } template void CuVector::Swap(Vector *vec) { -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->dim_ == 0) { if (vec->dim_ != 0) { @@ -927,7 +901,7 @@ void 
CuVector::Swap(Vector *vec) { template void CuVector::Destroy() { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { if (this->data_ != NULL) CuDevice::Instantiate().Free(this->data_); } else @@ -962,7 +936,7 @@ template void CuVectorBase::SetZero() { if (dim_==0 || data_==NULL) return; #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(dim_>=0); KALDI_ASSERT(data_!=NULL); Timer tim; @@ -997,13 +971,13 @@ std::ostream &operator << (std::ostream &out, const CuVectorBase &vec); template void CuVectorBase::Set(Real value) { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU1DBLOCK); dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK)); ::MatrixDim d = { 1, Dim(), Dim() }; - + cuda_set_const(dimGrid, dimBlock, data_, value, d); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1019,7 +993,7 @@ void CuVectorBase::Set(Real value) { template void CuVectorBase::Add(Real value) { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU1DBLOCK); @@ -1063,8 +1037,9 @@ void CuVectorBase::CopyDiagFromMat(const CuMatrix &M) { if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); Timer tim; - cublas_copy(dim_, M.Data(), M.Stride() + 1, data_, 1); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, M.Data(), M.Stride() + 1, + data_, 1)); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1101,14 +1076,13 @@ void CuVectorBase::AddVec(Real alpha, const CuVectorBase &vec, KALDI_ASSERT(vec.Dim() == Dim()); #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; int32 dim = this->dim_; Real *data = this->data_; const Real *vec_data = vec.data_; - if (beta != 1.0) cuda_scal(dim, beta, data, 1); - if (alpha != 0.0) cuda_axpy(dim, alpha, vec_data, 1, data, 1); - CU_SAFE_CALL(cudaGetLastError()); + if (beta != 1.0) CU_SAFE_CALL(cuda_scal(GetCublasHandle(), dim, beta, data, 1)); + if (alpha != 0.0) CU_SAFE_CALL(cuda_axpy(GetCublasHandle(), dim, alpha, vec_data, 1, data, 1)); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1161,20 +1135,20 @@ void CuVectorBase::AddColSumMat(Real alpha, } - -template + +template void CuVectorBase::InvertElements() { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU1DBLOCK, 1); dim3 dimGrid(n_blocks(dim_, CU1DBLOCK)); MatrixDim d = {1, dim_, dim_}; cuda_invert_elements(dimGrid, dimBlock, data_, d); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index ed7dd5bdcb2..54c1ac0ad4f 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -82,9 +82,6 @@ class CuVectorBase { template void CopyFromVec(const VectorBase &src); - template - void CopyFromSmat(const CuSparseMatrix &smat); - template void CopyToVec(VectorBase *dst) const; @@ -125,7 +122,7 @@ class CuVectorBase { void ApplyExp(); void ApplyLog(); MatrixIndexT ApplyFloor(Real floor_val); - void ApplyCeiling(Real ceiling_val); + MatrixIndexT ApplyCeiling(Real ceiling_val); void 
ApplyPow(Real power); Real Sum() const; void SetRandn(); @@ -215,6 +212,7 @@ class CuVectorBase { Real *data_; ///< GPU data pointer (or regular data pointer ///< if CUDA is not compiled in or we have no GPU). MatrixIndexT dim_; ///< dimension of the vector + private: KALDI_DISALLOW_COPY_AND_ASSIGN(CuVectorBase); }; @@ -252,13 +250,6 @@ class CuVector: public CuVectorBase { this->CopyFromVec(Vector(v)); } - template - explicit CuVector(const CuSparseMatrix &smat) : - CuVectorBase () { - Resize(smat.NumElements(), kUndefined); - this->CopyFromSmat(smat); - } - /// Allocate the memory void Resize(MatrixIndexT dim, MatrixResizeType t = kSetZero); @@ -339,8 +330,8 @@ bool ApproxEqual(const CuVectorBase &a, } template -inline void AssertEqual(CuVectorBase &a, CuVectorBase &b, - float tol = 0.01) { +inline void AssertEqual(const CuVectorBase &a, + const CuVectorBase &b, Real tol = 0.01) { KALDI_ASSERT(a.ApproxEqual(b, tol)); } diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index cec9b1fe9ac..f1d018a248d 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -25,85 +25,126 @@ namespace kaldi { #if HAVE_CUDA == 1 -inline void cublas_gemm(char transa, char transb, int m, int n,int k, float alpha, const float *A, int lda,const float *B, int ldb, float beta, float *C, int ldc) { - cublasSgemm(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); -} -inline void cublas_gemm(char transa, char transb, int m, int n,int k, double alpha, const double *A, int lda,const double *B, int ldb, double beta, double *C, int ldc) { - cublasDgemm(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); -} -inline void cublas_trsm(int m, int n, float alpha, const float* A, int lda, float* B, int ldb) { - cublasStrsm('l','u','n','n',m,n,alpha,A,lda,B,ldb); -} -inline void cublas_trsm(int m, int n, double alpha, const double* A, int lda, double* B, int ldb) { - cublasDtrsm('l','u','n','n',m,n,alpha,A,lda,B,ldb); -} -inline void cublas_syrk(char uplo, char trans, int n, int k, - float alpha, const float *A, int lda, - float beta, float *C, int ldc) { - cublasSsyrk(uplo,trans,n,k,alpha,A,lda,beta,C,ldc); -} -inline void cublas_syrk(char uplo, char trans, int n, int k, - double alpha, const double *A, int lda, - double beta, double *C, int ldc) { - cublasDsyrk(uplo,trans,n,k,alpha,A,lda,beta,C,ldc); -} -inline float cublas_dot(int n, const float *x, int incx, const float *y, int incy) { - return cublasSdot(n, x, incx, y, incy); -} -inline double cublas_dot(int n, const double *x, int incx, const double *y, int incy) { - return cublasDdot(n, x, incx, y, incy); -} -inline float cublas_asum(int n, const float* x, int incx) { - return cublasSasum(n, x, incx); -} -inline double cublas_asum(int n, const double* x, int incx) { - return cublasDasum(n, x, incx); -} -inline float cublas_nrm2(int n, const float* x, int incx) { - return cublasSnrm2(n, x, incx); +inline cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n,int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, + float *C, int ldc) { + return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); +} +inline cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n,int k, double alpha, + const double *A, int lda, const double *B, int ldb, double beta, + double *C, int ldc) { + return 
cublasDgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); +} +inline cublasStatus_t cublas_ger(cublasHandle_t handle, int m, int n, float alpha, + const float *x, int incx, const float *y, int incy, float *A, int lda ) { + return cublasSger_v2(handle,m,n,&alpha,x,incx,y,incy,A,lda); +} +inline cublasStatus_t cublas_ger(cublasHandle_t handle, int m, int n, double alpha, + const double *x, int incx, const double *y, int incy, double *A, int lda ) { + return cublasDger_v2(handle,m,n,&alpha,x,incx,y,incy,A,lda); +} +inline cublasStatus_t cublas_gemmBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, float alpha, + const float *A[], int lda, const float *B[], int ldb, float beta, + float *C[], int ldc, int batchCount) { + return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); +} +inline cublasStatus_t cublas_gemmBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, double alpha, + const double *A[], int lda, const double *B[], int ldb, double beta, + double *C[], int ldc, int batchCount) { + return cublasDgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); +} +inline cublasStatus_t cublas_trsm(cublasHandle_t handle, int m, int n, float alpha, + const float* A, int lda, float* B, int ldb) { + return cublasStrsm_v2(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&alpha,A,lda,B,ldb); +} +inline cublasStatus_t cublas_trsm(cublasHandle_t handle, int m, int n, double alpha, + const double* A, int lda, double* B, int ldb) { + return cublasDtrsm_v2(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&alpha,A,lda,B,ldb); +} +inline cublasStatus_t cublas_syrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, float alpha, + const float *A, int lda, float beta, float *C, int ldc) { + return cublasSsyrk_v2(handle,uplo,trans,n,k,&alpha,A,lda,&beta,C,ldc); +} +inline cublasStatus_t cublas_syrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, double alpha, + const double *A, int lda, double beta, double *C, int ldc) { + return cublasDsyrk_v2(handle,uplo,trans,n,k,&alpha,A,lda,&beta,C,ldc); +} +inline cublasStatus_t cublas_dot(cublasHandle_t handle, int n, const float *x, + int incx, const float *y, int incy, float *result) { + return cublasSdot_v2(handle, n, x, incx, y, incy, result); +} +inline cublasStatus_t cublas_dot(cublasHandle_t handle, int n, const double *x, + int incx, const double *y, int incy, double *result) { + return cublasDdot_v2(handle, n, x, incx, y, incy, result); +} +inline cublasStatus_t cublas_asum(cublasHandle_t handle, int n, const float* x, + int incx, float *result) { + return cublasSasum_v2(handle, n, x, incx, result); +} +inline cublasStatus_t cublas_asum(cublasHandle_t handle, int n, const double* x, + int incx, double *result) { + return cublasDasum_v2(handle, n, x, incx, result); +} +inline cublasStatus_t cublas_nrm2(cublasHandle_t handle, int n, const float* x, + int incx, float *result) { + return cublasSnrm2_v2(handle, n, x, incx, result); + } -inline double cublas_nrm2(int n, const double* x, int incx) { - return cublasDnrm2(n, x, incx); +inline cublasStatus_t cublas_nrm2(cublasHandle_t handle, int n, const double* x, + int incx, double *result) { + return cublasDnrm2_v2(handle, n, x, incx, result); } -inline void 
cublas_copy(int n, const float* x, int incx, - float* y, int incy) { - cublasScopy(n,x,incx,y,incy); +inline cublasStatus_t cublas_copy(cublasHandle_t handle, int n, const float* x, + int incx, float* y, int incy) { + return cublasScopy_v2(handle,n,x,incx,y,incy); } -inline void cublas_copy(int n, const double* x, int incx, - double* y, int incy) { - cublasDcopy(n,x,incx,y,incy); +inline cublasStatus_t cublas_copy(cublasHandle_t handle, int n, const double* x, + int incx, double* y, int incy) { + return cublasDcopy_v2(handle,n,x,incx,y,incy); } -inline void cublas_scal(int n, float alpha, float* mat, int incx) { - cublasSscal(n, alpha, mat, incx); +inline cublasStatus_t cublas_scal(cublasHandle_t handle, int n, float alpha, + float* mat, int incx) { + return cublasSscal_v2(handle, n, &alpha, mat, incx); } -inline void cublas_scal(int n, double alpha, double* mat, int incx) { - cublasDscal(n, alpha, mat, incx); +inline cublasStatus_t cublas_scal(cublasHandle_t handle, int n, double alpha, + double* mat, int incx) { + return cublasDscal_v2(handle, n, &alpha, mat, incx); } -inline void cublas_axpy(int n, float alpha, const float* x, int incx, float* y, int incy) { - cublasSaxpy(n, alpha, x, incx, y, incy); +inline cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, float alpha, + const float* x, int incx, float* y, int incy) { + return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy); } -inline void cublas_axpy(int n, double alpha, const double* x, int incx, double* y, int incy) { - cublasDaxpy(n, alpha, x, incx, y, incy); +inline cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, double alpha, + const double* x, int incx, double* y, int incy) { + return cublasDaxpy_v2(handle, n, &alpha, x, incx, y, incy); } -inline void cublas_gemv(char trans, int m, int n, float alpha, - const float* A, int lda, const float* x, - int incx, float beta, float* y, int incy) { - cublasSgemv(trans,m,n,alpha,A,lda,x,incx,beta,y,incy); +inline cublasStatus_t cublas_gemv(cublasHandle_t handle, cublasOperation_t trans, + int m, int n, float alpha, const float* A, int lda, const float* x, + int incx, float beta, float* y, int incy) { + return cublasSgemv_v2(handle,trans,m,n,&alpha,A,lda,x,incx,&beta,y,incy); } -inline void cublas_gemv(char trans, int m, int n, double alpha, - const double* A, int lda, const double* x, - int incx, double beta, double* y, int incy) { - cublasDgemv(trans,m,n,alpha,A,lda,x,incx,beta,y,incy); +inline cublasStatus_t cublas_gemv(cublasHandle_t handle, cublasOperation_t trans, + int m, int n, double alpha, const double* A, int lda, const double* x, + int incx, double beta, double* y, int incy) { + return cublasDgemv_v2(handle,trans,m,n,&alpha,A,lda,x,incx,&beta,y,incy); } -inline void cublas_spmv(char uplo, int n, float alpha, const float *AP, const float *x, - int incx, float beta, float *y, int incy) { - cublasSspmv(uplo, n, alpha, AP, x, incx, beta, y, incy); +inline cublasStatus_t cublas_spmv(cublasHandle_t handle, cublasFillMode_t uplo, + int n, float alpha, const float *AP, const float *x, int incx, + float beta, float *y, int incy) { + return cublasSspmv_v2(handle, uplo, n, &alpha, AP, x, incx, &beta, y, incy); } -inline void cublas_spmv(char uplo, int n, double alpha, const double *AP, const double *x, - int incx, double beta, double *y, int incy) { - cublasDspmv(uplo, n, alpha, AP, x, incx, beta, y, incy); +inline cublasStatus_t cublas_spmv(cublasHandle_t handle, cublasFillMode_t uplo, + int n, double alpha, const double *AP, const double *x, int incx, + double beta, 
double *y, int incy) { + return cublasDspmv_v2(handle, uplo, n, &alpha, AP, x, incx, &beta, y, incy); } // Use caution with these, the 'transpose' argument is the opposite of what it @@ -111,22 +152,22 @@ inline void cublas_spmv(char uplo, int n, double alpha, const double *AP, const // had to switch 'l' to 'u'; we view our packed matrices as lower-triangular, // row-by-row, but CUDA views the same layout as upper-triangular, // column-by-column. -inline void cublas_tpmv(char trans, int n, - const float* Ap, float* x, int incx) { - return cublasStpmv('u', trans, 'n', n, Ap, x, incx); +inline cublasStatus_t cublas_tpmv(cublasHandle_t handle, cublasOperation_t trans, + int n, const float* Ap, float* x, int incx) { + return cublasStpmv_v2(handle, CUBLAS_FILL_MODE_UPPER, trans, CUBLAS_DIAG_NON_UNIT, n, Ap, x, incx); } -inline void cublas_tpmv(char trans, int n, const double* Ap, - double* x,int incx) { - return cublasDtpmv('u', trans, 'n', n, Ap, x, incx); +inline cublasStatus_t cublas_tpmv(cublasHandle_t handle, cublasOperation_t trans, + int n, const double* Ap, double* x,int incx) { + return cublasDtpmv_v2(handle, CUBLAS_FILL_MODE_UPPER, trans, CUBLAS_DIAG_NON_UNIT, n, Ap, x, incx); } -inline void cublas_spr(char uplo, int n, float alpha, const float *x, - int incx, float *AP) { - cublasSspr(uplo, n, alpha, x, incx, AP); +inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, + int n, float alpha, const float *x, int incx, float *AP) { + return cublasSspr_v2(handle, uplo, n, &alpha, x, incx, AP); } -inline void cublas_spr(char uplo, int n, double alpha, const double *x, - int incx, double *AP) { - cublasDspr(uplo, n, alpha, x, incx, AP); +inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, + int n, double alpha, const double *x, int incx, double *AP) { + return cublasDspr_v2(handle, uplo, n, &alpha, x, incx, AP); } #endif diff --git a/src/decoder/Makefile b/src/decoder/Makefile index e38f5ab63b6..95d5c6effca 100644 --- a/src/decoder/Makefile +++ b/src/decoder/Makefile @@ -1,9 +1,9 @@ all: -EXTRA_CXXFLAGS = -Wno-sign-compare -O3 +EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -TESTFILES = +TESTFILES = OBJFILES = training-graph-compiler.o lattice-simple-decoder.o lattice-faster-decoder.o \ lattice-faster-online-decoder.o simple-decoder.o faster-decoder.o \ @@ -13,7 +13,7 @@ LIBNAME = kaldi-decoder ADDLIBS = ../transform/kaldi-transform.a ../tree/kaldi-tree.a ../lat/kaldi-lat.a \ ../sgmm/kaldi-sgmm.a ../gmm/kaldi-gmm.a ../hmm/kaldi-hmm.a ../util/kaldi-util.a \ - ../base/kaldi-base.a ../matrix/kaldi-matrix.a + ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/decoder/lattice-faster-decoder.cc b/src/decoder/lattice-faster-decoder.cc index 275335a5ce9..c5c9aae743c 100644 --- a/src/decoder/lattice-faster-decoder.cc +++ b/src/decoder/lattice-faster-decoder.cc @@ -145,12 +145,12 @@ bool LatticeFasterDecoder::GetRawLattice(Lattice *ofst, TopSortTokens(active_toks_[f].toks, &token_list); for (size_t i = 0; i < token_list.size(); i++) if (token_list[i] != NULL) - tok_map[token_list[i]] = ofst->AddState(); + tok_map[token_list[i]] = ofst->AddState(); } // The next statement sets the start state of the output FST. Because we // topologically sorted the tokens, state zero must be the start-state. 
ofst->SetStart(0); - + KALDI_VLOG(4) << "init:" << num_toks_/2 + 3 << " buckets:" << tok_map.bucket_count() << " load:" << tok_map.load_factor() << " max:" << tok_map.max_load_factor(); @@ -224,6 +224,32 @@ void LatticeFasterDecoder::PossiblyResizeHash(size_t num_toks) { } } +/* + A note on the definition of extra_cost. + + extra_cost is used in pruning tokens, to save memory. + + Define the 'forward cost' of a token as zero for any token on the frame + we're currently decoding; and for other frames, as the shortest-path cost + between that token and a token on the frame we're currently decoding. + (by "currently decoding" I mean the most recently processed frame). + + Then define the extra_cost of a token (always >= 0) as the forward-cost of + the token minus the smallest forward-cost of any token on the same frame. + + We can use the extra_cost to accurately prune away tokens that we know will + never appear in the lattice. If the extra_cost is greater than the desired + lattice beam, the token would provably never appear in the lattice, so we can + prune away the token. + + The advantage of storing the extra_cost rather than the forward-cost, is that + it is less costly to keep the extra_cost up-to-date when we process new frames. + When we process a new frame, *all* the previous frames' forward-costs would change; + but in general the extra_cost will change only for a finite number of frames. + (Actually we don't update all the extra_costs every time we update a frame; we + only do it every 'config_.prune_interval' frames). + */ + // FindOrAddToken either locates a token in hash of toks_, // or if necessary inserts a new, empty token (i.e. with no forward links) // for the current frame. [note: it's inserted if necessary into hash toks_ @@ -352,7 +378,7 @@ void LatticeFasterDecoder::PruneForwardLinksFinal() { if (active_toks_[frame_plus_one].toks == NULL) // empty list; should not happen. KALDI_WARN << "No tokens alive at end of file"; - + typedef unordered_map::const_iterator IterType; ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); decoding_finalized_ = true; @@ -623,7 +649,7 @@ BaseFloat LatticeFasterDecoder::GetCutoff(Elem *list_head, size_t *tok_count, KALDI_VLOG(6) << "Number of tokens active on frame " << NumFramesDecoded() << " is " << tmp_array_.size(); - + if (tmp_array_.size() > static_cast(config_.max_active)) { std::nth_element(tmp_array_.begin(), tmp_array_.begin() + config_.max_active, @@ -634,7 +660,7 @@ BaseFloat LatticeFasterDecoder::GetCutoff(Elem *list_head, size_t *tok_count, if (adaptive_beam) *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; return max_active_cutoff; - } + } if (tmp_array_.size() > static_cast(config_.min_active)) { if (config_.min_active == 0) min_active_cutoff = best_weight; else { @@ -645,7 +671,7 @@ BaseFloat LatticeFasterDecoder::GetCutoff(Elem *list_head, size_t *tok_count, tmp_array_.end()); min_active_cutoff = tmp_array_[config_.min_active]; } - } + } if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. if (adaptive_beam) *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; @@ -673,7 +699,7 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { BaseFloat cur_cutoff = GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " << adaptive_beam; - + PossiblyResizeHash(tok_cnt); // This makes sure the hash is always big enough. 
BaseFloat next_cutoff = std::numeric_limits::infinity(); @@ -761,7 +787,7 @@ void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { // it may cause us to process states unnecessarily (e.g. more than once), // but in the baseline code, turning this vector into a set to fix this // problem did not improve overall speed. - + KALDI_ASSERT(queue_.empty()); for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) queue_.push_back(e->key); @@ -771,7 +797,7 @@ void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { warned_ = true; } } - + while (!queue_.empty()) { StateId state = queue_.back(); queue_.pop_back(); diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index 158248cc445..514886d65ee 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -54,7 +54,7 @@ struct LatticeFasterDecoderConfig { // LatticeFasterDecoder class itself, but by the code that calls it, for // example in the function DecodeUtteranceLatticeFaster. fst::DeterminizeLatticePhonePrunedOptions det_opts; - + LatticeFasterDecoderConfig(): beam(16.0), max_active(std::numeric_limits::max()), min_active(200), @@ -99,7 +99,7 @@ class LatticeFasterDecoder { typedef Arc::Label Label; typedef Arc::StateId StateId; typedef Arc::Weight Weight; - + // instantiate this class once for each thing you have to decode. LatticeFasterDecoder(const fst::Fst &fst, const LatticeFasterDecoderConfig &config); @@ -117,7 +117,7 @@ class LatticeFasterDecoder { const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - + ~LatticeFasterDecoder(); /// Decodes until there are no more frames left in the "decodable" object.. @@ -230,12 +230,9 @@ class LatticeFasterDecoder { // links from it when we process the next frame. struct Token { BaseFloat tot_cost; // would equal weight.Value()... cost up to this point. - BaseFloat extra_cost; // >= 0. After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). + BaseFloat extra_cost; // >= 0. This is used in pruning a way tokens. + // there is a comment in lattice-faster-decoder.cc explaining this; + // search for "a note on the definition of extra_cost". ForwardLink *links; // Head of singly linked list of ForwardLinks @@ -365,8 +362,9 @@ class LatticeFasterDecoder { const fst::Fst &fst_; bool delete_fst_; std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeFasterDecoderConfig config_; int32 num_toks_; // current total #toks allocated... 
bool warned_; @@ -409,7 +407,7 @@ class LatticeFasterDecoder { void ClearActiveTokens(); - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoder); + KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoder); }; diff --git a/src/decoder/lattice-faster-online-decoder.h b/src/decoder/lattice-faster-online-decoder.h index 30adb6df302..b69b5492fb7 100644 --- a/src/decoder/lattice-faster-online-decoder.h +++ b/src/decoder/lattice-faster-online-decoder.h @@ -62,7 +62,7 @@ class LatticeFasterOnlineDecoder { BestPathIterator(void *t, int32 f): tok(t), frame(f) { } bool Done() { return tok == NULL; } }; - + // instantiate this class once for each thing you have to decode. LatticeFasterOnlineDecoder(const fst::Fst &fst, const LatticeFasterDecoderConfig &config); @@ -80,7 +80,7 @@ class LatticeFasterOnlineDecoder { const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - + ~LatticeFasterOnlineDecoder(); /// Decodes until there are no more frames left in the "decodable" object.. @@ -107,12 +107,12 @@ class LatticeFasterOnlineDecoder { bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - + /// This function does a self-test of GetBestPath(). Returns true on /// success; returns false and prints a warning on failure. bool TestGetBestPath(bool use_final_probs = true) const; - - + + /// This function returns an iterator that can be used to trace back /// the best path. If use_final_probs == true and at least one final state /// survived till the end, it will use the final-probs in working out the best @@ -133,7 +133,7 @@ class LatticeFasterOnlineDecoder { /// while leaving its "nextstate" variable unchanged. BestPathIterator TraceBackBestPath( BestPathIterator iter, LatticeArc *arc) const; - + /// Outputs an FST corresponding to the raw, state-level /// tracebacks. Returns true if result is nonempty. /// If "use_final_probs" is true AND we reached the final-state @@ -152,7 +152,7 @@ class LatticeFasterOnlineDecoder { bool use_final_probs, BaseFloat beam) const; - + /// InitDecoding initializes the decoding, and should only be used if you /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to /// call this. You can also call InitDecoding if you have already decoded an @@ -334,7 +334,7 @@ class LatticeFasterOnlineDecoder { /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem); - + /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. /// Returns the cost cutoff for subsequent ProcessNonemitting() to use. BaseFloat ProcessEmitting(DecodableInterface *decodable); @@ -343,7 +343,7 @@ class LatticeFasterOnlineDecoder { /// ProcessEmitting() on each frame. The cost cutoff is computed by the /// preceding ProcessEmitting(). void ProcessNonemitting(BaseFloat cost_cutoff); - + // HashList defined in ../util/hash-list.h. It actually allows us to maintain // more than one list (e.g. for current and previous frames), but only one of // them at a time can be indexed by StateId. It is indexed by frame-index @@ -361,9 +361,10 @@ class LatticeFasterOnlineDecoder { // make it class member to avoid internal new/delete. const fst::Fst &fst_; bool delete_fst_; - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. 
+ std::vector cost_offsets_; // This contains, for each + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeFasterDecoderConfig config_; int32 num_toks_; // current total #toks allocated... bool warned_; diff --git a/src/decoder/lattice-tracking-decoder.h b/src/decoder/lattice-tracking-decoder.h index 91484b56c60..0737ca3db36 100644 --- a/src/decoder/lattice-tracking-decoder.h +++ b/src/decoder/lattice-tracking-decoder.h @@ -74,7 +74,7 @@ struct LatticeTrackingDecoderConfig { } void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 + KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && prune_interval > 0 && beam_delta > 0.0 && hash_ratio >= 1.0 && extra_beam >= 0.0 && max_beam >= beam); } @@ -135,7 +135,7 @@ class LatticeTrackingDecoder { /// format. bool Decode(DecodableInterface *decodable, const fst::StdVectorFst &arc_graph); - + /// says whether a final-state was active on the last frame. If it was not, the /// lattice (or traceback) will end with states that are not final-states. bool ReachedFinal() const { return final_active_; } @@ -167,7 +167,7 @@ class LatticeTrackingDecoder { /// final-probs as one. bool GetLattice(fst::MutableFst *ofst, bool use_final_probs = true) const; - + private: struct Token; // ForwardLinks are the links from a token to a token on the next frame. @@ -181,13 +181,13 @@ class LatticeTrackingDecoder { ForwardLink *next; // next in singly-linked list of forward links from a // token. inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, + BaseFloat graph_cost, BaseFloat acoustic_cost, ForwardLink *next): next_tok(next_tok), ilabel(ilabel), olabel(olabel), - graph_cost(graph_cost), acoustic_cost(acoustic_cost), + graph_cost(graph_cost), acoustic_cost(acoustic_cost), next(next) { } - }; - + }; + // Token is what's resident in a particular state at a particular time. // In this decoder a Token actually contains *forward* links. // When first created, a Token just has the (total) cost. We add forward @@ -200,19 +200,19 @@ class LatticeTrackingDecoder { // that any of the currently active states at the decoding front may // eventually succeed (e.g. if you were to take the currently active states // one by one and compute this difference, and then take the minimum). - + ForwardLink *links; // Head of singly linked list of ForwardLinks - + Token *next; // Next in list of tokens for this frame. - + StateId lat_state; // current state in graph arc lattice from first pass decoding // lat_state == fst::kNoStateId means that this token is not tracked - + inline Token(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLink *links, Token *next, StateId lat_state): tot_cost(tot_cost), extra_cost(extra_cost), links(links), next(next), lat_state(lat_state) { } inline void DeleteForwardLinks() { - ForwardLink *l = links, *m; + ForwardLink *l = links, *m; while (l != NULL) { m = l->next; delete l; @@ -221,7 +221,7 @@ class LatticeTrackingDecoder { links = NULL; } }; - + // head and tail of per-frame list of Tokens (list is in topological order), // and something saying whether we ever pruned it using PruneForwardLinks. 
struct TokenList { @@ -231,7 +231,7 @@ class LatticeTrackingDecoder { TokenList(): toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) { } }; - + typedef HashList::Elem Elem; void PossiblyResizeHash(size_t num_toks); @@ -248,7 +248,7 @@ class LatticeTrackingDecoder { // lat_state is the next state in the arc graph lattice inline Token *FindOrAddToken(StateId state, StateId lat_state, int32 frame, BaseFloat tot_cost, bool *changed); - + // prunes outgoing links for all tokens in active_toks_[frame] // it's called by PruneActiveTokens // all links, that have link_extra_cost > lattice_beam are pruned @@ -267,13 +267,13 @@ class LatticeTrackingDecoder { // on the final frame. If there are final tokens active, it uses // the final-probs for pruning, otherwise it treats all tokens as final. void PruneForwardLinksFinal(int32 frame); - + // Prune away any tokens on this frame that have no forward links. // [we don't do this in PruneForwardLinks because it would give us // a problem with dangling pointers]. // It's called by PruneActiveTokens if any forward links have been pruned void PruneTokensForFrame(int32 frame); - + // Go backwards through still-alive tokens, pruning them. note: cur_frame is // where hash toks_ are (so we do not want to mess with it because these tokens // don't yet have forward pointers), but we do all previous frames, unless we @@ -286,7 +286,7 @@ class LatticeTrackingDecoder { /// Version of PruneActiveTokens that we call on the final frame. /// Takes into account the final-prob of tokens. void PruneActiveTokensFinal(int32 cur_frame); - + /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem); @@ -311,9 +311,10 @@ class LatticeTrackingDecoder { std::vector tmp_array_; // used in GetCutoff. // make it class member to avoid internal new/delete. const fst::Fst &fst_; - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. + std::vector cost_offsets_; // This contains, for each + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeTrackingDecoderConfig config_; int32 num_toks_; // current total #toks allocated... bool warned_; @@ -331,9 +332,9 @@ class LatticeTrackingDecoder { // to the caller, who then has to call toks_.Delete(e) for each one. It was designed // this way for convenience in propagating tokens from one frame to the next. void ClearToks(Elem *list); - + void ClearActiveTokens(); - + }; diff --git a/src/doc/README b/src/doc/README index ea30b348450..566f0d0bf64 100644 --- a/src/doc/README +++ b/src/doc/README @@ -3,7 +3,7 @@ #code itself, and its comments, is the rest of the source). Doxygen will create #the actual documentation in ../html/ (e.g. open ../html/index.html in a browser). #To run doxygen, type "doxygen" from one directory above this. If this does -#not work, search for "Kaldi main page" online and you will hopefully get a +#not work, search for "Kaldi main page" online and you will hopefully get a #version of the documentation. # Note: I generally run this file by typing ". 
doc/README" from src/, @@ -13,7 +13,7 @@ #ssh-keygen -t dsa -C "vpanayotov@shell.sf.net" #ssh-add # end then import the contents of .ssh/id_dsa.pub into -# http://sourceforge.net/account/services +# http://sourceforge.net/account/services #(from Dan:) The commands below show how I compile the documentation and copy it #to the homepage at sourceforge. I do this from JHU at the current time. @@ -24,7 +24,7 @@ doxygen cp doc/*.pptx html/; # get the style sheet in the html/ directory. # note, we actually use a modified version of the header, which is checked into -# doc/. +# doc/. doxygen -w html header.html footer.html stylesheet.css rm header.html footer.html mv stylesheet.css html/ @@ -34,37 +34,21 @@ if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then cp ../misc/logo/KaldiIco.png html/favicon.ico tar -czf html.tar.gz html - echo "**First copying to kaldi-asr.org**" - # First copy to kaldi-asr.org/docs + # Copy to kaldi-asr.org/docs2/ scp html.tar.gz newrelay:/var/www/kaldi-asr echo 'cd /var/www/kaldi-asr/; rm -rf html doc.old; - tar -xzf html.tar.gz; mv doc doc.old; mv html doc; rm -rf doc.old' \ + tar -xzf html.tar.gz; mv doc doc.old; mv html doc; rm -rf doc.old; rm html.tar.gz' \ | ssh newrelay bash - - echo "**Now copying to sourceforge**" - # Next copy to sourceforge. - if true; then # use method that works when their shell access is down. - rm -rf htdocs # make sure it's not left over from before. - mv html htdocs - scp -r htdocs danielpovey@web.sourceforge.net:/home/project-web/kaldi/ - mv htdocs html - else - scp html.tar.gz danielpovey@web.sourceforge.net:/home/project-web/kaldi/htdocs/ - ssh danielpovey,kaldi@shell.sourceforge.net create - echo 'cd /home/project-web/kaldi/htdocs/; rm -rf html; - tar -xzf html.tar.gz; for x in html/*; do mv $x .; done ' \ - | ssh danielpovey,kaldi@shell.sourceforge.net bash - fi fi # You could uncomment and run the lines below as an example of how to figure out # the amount of posts to the Kaldi forums on Sourceforge, per month. #curl 'http://sourceforge.net/p/kaldi/discussion/stats_data?forum=&begin=2011-04-14&end=2014-06-13' > foo -#cat foo | perl -ane ' s/.*://; @A = split("]"); +#cat foo | perl -ane ' s/.*://; @A = split("]"); # foreach $a(@A){ $a =~ s/[,\[]//g; print "$a\n"; }' | \ -# perl -e 'while(<>) { @A = split; if (@A == 2) { ($date, $count) = @A; $date /= 1000; +# perl -e 'while(<>) { @A = split; if (@A == 2) { ($date, $count) = @A; $date /= 1000; # @date_array = gmtime $date; $month = $date_array[4]; $year = 1900 + $date_array[5]; $count{$year. " " .sprintf("%02d", $month+1)} += $count; }} # foreach $k (sort keys %count) { print "$k $count{$k}\n"; } ' @@ -78,5 +62,3 @@ fi # and added it to the repo. # - - diff --git a/src/doc/chain.dox b/src/doc/chain.dox new file mode 100644 index 00000000000..9aa515d5b0e --- /dev/null +++ b/src/doc/chain.dox @@ -0,0 +1,424 @@ +// doc/chain.dox + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +namespace kaldi { + +/** + \page chain 'Chain' models + + \section chain_intro Introduction to 'chain' models + + The 'chain' models are a type of DNN-HMM model, implemented using \ref dnn3 "nnet3", and differ from the + conventional model in various ways; you can think of them as a different + design point in the space of acoustic models. + + - We use a 3 times smaller frame rate at the output of the neural net. + This significantly reduces the amount of computation required at + test time, making real-time decoding much easier. + - The models are trained right from the start with a sequence-level + objective function-- namely, the log probability of the correct sequence. It is + essentially MMI implemented without lattices on the GPU, by doing a full + forward-backward on a decoding graph derived from a phone n-gram language + model. + - Because of the reduced frame rate, we need to use unconventional + HMM topologies (allowing the traversal of the HMM in one state). + - We use fixed transition probabilities in the HMM, and don't train + them (we may decide to train them in future; but for the most part the neural-net + output probabilities can do the same job as the transition probabilities, + depending on the topology). + - Currently, only nnet3 DNNs are supported (see \ref dnn3), and + online decoding has not yet been implemented (we're aiming for April to June 2016). + - Currently the results are a bit better than those of conventional + DNN-HMMs (about 5\% relative better), but the system is about 3 times + faster to decode; training time is probably a bit faster too, but + we haven't compared it exactly. + + \section chain_scripts Where to find scripts for the 'chain' models + + The current best scripts for the 'chain' models can be found in the + Switchboard setup in egs/swbd/s5c; the script local/chain/run_tdnn_2o.sh is + the current best one. This is currently available in the 'chain' branch of + the official github repository (https://github.com/kaldi-asr/kaldi.git) and + will eventually be merged into master. + + This script uses TDNNs as the neural net (we've been doing the development + with TDNNs because they are easier to tune than LSTMs), and gives a better WER + than the baseline TDNN: 11.4\%, versus 12.1\% for the best TDNN baseline + (on the Switchboard-only portion of eval2000). + + \section chain_model The chain model + + The chain model itself is no different from a conventional DNN-HMM, used with + a (currently) 3-fold reduced frame rate at the output of the DNN. The input + features of the DNN are at the original frame rate of 100 per second; this makes + sense because all the neural nets we are currently using (LSTMs, TDNNs) have some kind + of recurrent connections or splicing inside them, i.e. they are not purely feedforward + nets. + + The difference from a normal model is the objective function used to train it: + instead of a frame-level objective, we use the log-probability of the correct + phone sequence as the objective function. The training process is quite + similar in principle to MMI training, in which we compute numerator and + denominator 'occupation probabilities' and the difference between the two is + used in the derivative computation. There is no need to normalize the DNN + outputs to sum to one on each frame any more; such normalization makes no difference.
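To make the derivative computation concrete, here is a minimal sketch (not the actual chain code; the function and argument names are hypothetical) of how the gradient with respect to the nnet output would be assembled from the two sets of occupation probabilities:

\verbatim
#include <vector>

// Hypothetical illustration: for each (frame, pdf-id) the derivative of the
// 'chain' objective w.r.t. the corresponding nnet output is the numerator
// occupation probability minus the denominator occupation probability.
// We assume num_post and den_post have identical shapes (frames x pdf-ids).
void ComputeChainDerivs(const std::vector<std::vector<double> > &num_post,
                        const std::vector<std::vector<double> > &den_post,
                        std::vector<std::vector<double> > *derivs) {
  derivs->resize(num_post.size());
  for (size_t t = 0; t < num_post.size(); t++) {
    (*derivs)[t].resize(num_post[t].size());
    for (size_t p = 0; p < num_post[t].size(); p++)
      (*derivs)[t][p] = num_post[t][p] - den_post[t][p];
  }
}
\endverbatim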
+ + Because of the reduced frame rate (one frame every 30 ms), we need to use a + modified HMM topology. We would like the HMM to be traversable in one + transition (as opposed to the 3 transitions of a model at the normal frame + rate). The currently favored topology has a state that can only occur once, + and then another state that can appear zero or more times. The state-clustering + is obtained using the same procedure as for GMM-based models, although + of course with a different topology (we convert the alignments to the new topology + and frame-rate). + + \section chain_training The training procedure for 'chain' models + + The training procedure for chain models is a lattice-free version of + MMI, where the denominator state posteriors are obtained by the + forward-backward algorithm over an HMM formed from a phone-level decoding graph, + and the numerator state posteriors are obtained by a similar forward-backward + algorithm but limited to sequences corresponding to the transcript. + + For each output index of the neural net (i.e. for each pdf-id), we + compute a derivative of the form (numerator occupation probability - + denominator occupation probability), and these are propagated back to the + network. + + + \subsection chain_training_denominator The denominator FST + + For the denominator part of the computation we do forward-backward over an HMM. + Actually, because we represent it as a finite state acceptor, the labels + (pdf-ids) are associated with the arcs and not the states, so it's not really an + HMM in the normal formulation, but it's easier to think of it as an HMM because + we use the forward-backward algorithm to get posteriors. + In the code and scripts we refer to it as the 'denominator FST'. + + \subsubsection chain_training_denominator_phone_lm Phone language model for the denominator FST + + The first stage in constructing the denominator FST is to create a phone + language model. This language model is learned from the training-data phone + alignments. This is an un-smoothed language model, meaning that we never + back off to lower-order n-grams. However, some language-model states are + removed entirely, so transitions to those states go instead to the lower-order + n-gram's state. The reason we avoid smoothing is to reduce the number of + arcs in the compiled graph after phonetic context expansion. + + The configuration that we settled on is to estimate a 4-gram language model, + and to never prune LM states below trigram (so we always maintain at least a + 2-phone history). On top of the number of states dictated by the no-prune + trigram rule, we have a specifiable number (e.g. 2000) of 4-gram language + model states which are to be retained (all the rest are identified with the + corresponding trigram state), and the ones we choose to retain are determined + in a way that maximizes the training-data likelihood. All probabilities are + estimated to maximize the training-data likelihood. The reason not to prune + the trigrams is that any sparsity in which trigrams are allowed will tend to + minimize the size of the compiled graph. Note that if our phone LM were just a + simple phone loop (i.e. a unigram), it would get expanded to triphones anyway + due to phonetic context effects, but it would have arcs for all possible + trigrams in it. So any sparsity we get from using the un-pruned trigram model + is a bonus.
Empirically, an un-smoothed trigram LM is what expands to the + smallest possible FST; and pruning some of the trigrams, while it increases + the size of the compiled FST, results in little or no WER improvement (at + least on 300 hours of data expanded 3-fold with speed perturbation; on less + data it might help). + + On the Switchboard setups the phone-LM perplexities for the various models we + tried were in the range 5 to 7; the phone-LM perplexity with our chosen + configuration (4-gram, pruned to trigram for all but 2000 states) was about 6. + It was not the case that lower phone-LM perplexity always led to better WER + of the trained system; as for conventional (word-based) MMI training, an + intermediate strength of language model seemed to work best. + + \subsubsection chain_training_denominator_compilation Compilation of the denominator FST + + The phone language model described in the previous section is expanded into an + FST with 'pdf-ids' as the arcs, in a process that mirrors the process of + decoding-graph compilation in normal Kaldi decoding (see \ref + graph_recipe_test), except that no lexicon is involved, and at the + end we convert the transition-ids to pdf-ids. + + One difference lies in how we minimize the size of the graph. The normal + recipe involves determinization and minimization. We were not able to + reduce the size of the graph using this procedure, or variants of it with + disambiguation symbols. Instead, our graph-minimization process can be described + compactly as follows: "Repeat 3 times: push, minimize, reverse; push, minimize, reverse". + 'push' refers to weight-pushing; 'reverse' refers to reversing the direction of the arcs, and + swapping initial and final states. + + + \subsubsection chain_training_denominator_normalization Initial and final probabilities, and 'normalization FST' + + The graph-creation process mentioned above naturally gives us an initial + state, and final probabilities for each state; but these are not the ones we + use in the forward-backward. The reason is that these probabilities are + applicable to utterance boundaries, but we train on split-up chunks of + utterances of a fixed length (e.g. 1.5 seconds). Constraining the HMM at these + arbitrarily chosen cut points to the initial and final states is not + appropriate. Instead, we use initial probabilities derived from 'running the HMM' for + a fixed number of iterations and averaging the probabilities; and final probabilities + equal to 1.0 for each state. We have a justification for this but don't have time to + explain it right now. In the denominator forward-backward process we apply these initial and + final probabilities to the initial and final frame as part of the computation. However, we also + write out a version of the denominator FST that has these initial and final probabilities, and we refer to + this as the 'normalization FST'. (The initial probabilities are emulated using epsilon arcs, because + FSTs do not support initial probabilities.) This 'normalization FST' will be used to add probabilities to the + numerator FSTs in a way that we'll describe later. + + \subsection chain_training_numerator Numerator FSTs + + As part of our preparation for the training process we produce something + called a 'numerator FST' for each utterance. The numerator FST encodes the + supervision transcript, and also encodes an alignment of that transcript + (i.e.
it forces similarity to a reference alignment obtained from a baseline + system), but it allows a little 'wiggle room' to vary from that reference. + By default we allow a phone to occur up to 0.05 seconds before or after its + begin and end positions respectively, in the lattice alignment. + Incorporating the alignment information is important because of the way we + train not on entire utterances but on split-up fixed-length pieces of + utterances (which, in turn, is important for GPU-based training): splitting up + the utterance into pieces is only possible if we know where the transcript aligns. + + Instead of enforcing a particular pronunciation of the training data, we use as + our reference a lattice of alternative pronunciations of the training data, + generated by a lattice-generating decoding procedure using an + utterance-specific graph as the decoding graph. This generates all alignments + of pronunciations that were within a beam of the best-scoring pronunciation. + + \subsubsection chain_training_numerator_splitting Splitting the numerator FSTs + + As mentioned, we train on fixed-size pieces of utterances (e.g. 1.5 seconds in + length). This requires that we split the numerator FSTs up into fixed-size + pieces. This isn't hard, since the numerator FSTs (which, remember, encode + time-alignment information) naturally have a structure where we can identify + any FST state with a particular frame index. Note: at the stage where we do this + splitting, there are no costs in the numerator FST yet-- it's just viewed as + encoding a constraint on paths-- so we do not have to make a decision about how to split up the costs +on the paths. + + \subsubsection chain_training_numerator_normalization Normalizing the numerator FSTs + + Above (\ref chain_training_denominator_normalization) we mentioned how we compute + initial and final probabilities for the denominator FST, and how we encode + these in a 'normalization FST'. We compose the split-up pieces of numerator + FST with this 'normalization FST' to ensure that the costs from the + denominator FST are reflected in the numerator FST. This ensures that + objective functions can never be positive (which makes them easier to + interpret), and also guards against the possibility that the numerator FST + could contain state sequences not allowed by the denominator FST, which in + principle could allow the objective function to increase without bound. The + reason why this could happen is that the phone LM lacks smoothing, and is + estimated from 1-best alignments, so the lattices could contain phone n-gram + sequences not seen in training. + + It happens occasionally (but very rarely) that this normalization process + generates an empty FST: this can occur when the lattice contains triphones that + were not present in the 1-best alignment used to train the phone language + model, and does not have any alternative paths at that point in the lattice + that could make up for the resulting 'failed' paths. This can happen because + the 1-best alignment and the lattice-producing alignment chose different + pronunciations of a word. These pieces of utterances are just discarded. + + \subsubsection chain_training_numerator_format Format of the numerator FSTs + + The numerator FSTs are weighted acceptors where the labels correspond to + pdf-ids plus one. We can't use the pdf-ids themselves as labels, because they could be zero, and zero + is treated specially (as epsilon) by OpenFst.
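+ Just to make that label convention concrete, here is a tiny Python sketch
+ (purely illustrative; this is not Kaldi code, and the helper function is
+ hypothetical) of turning a sequence of pdf-ids into the arcs of a linear
+ acceptor whose labels are pdf-id plus one:
+ \verbatim
+ # Toy sketch of the "pdf-id plus one" labeling convention described above.
+ # Label 0 is reserved for epsilon in OpenFst, so arc labels are pdf_id + 1.
+ def arcs_for_pdf_sequence(pdf_ids):
+     """Return (src_state, dest_state, label) arcs for a linear acceptor."""
+     return [(t, t + 1, pdf + 1) for t, pdf in enumerate(pdf_ids)]
+
+ print(arcs_for_pdf_sequence([0, 3, 3, 7]))
+ # -> [(0, 1, 1), (1, 2, 4), (2, 3, 4), (3, 4, 8)]
+ \endverbatim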
When we form minibatches, instead + of storing an array of separate numerator FSTs we actually append them together to form a longer FST; + this enables us to do a single forward-backward over all utterances in the minibatch, + which directly computes the total numerator log-probability. (This isn't an important + feature, it's just a software detail, which we explain here lest it generate confusion). + + \subsection chain_training_splitting Fixed-length chunks, and minibatches + + In order to train on minibatches, we split up our utterances into fixed-length + chunks of speech (of length 1.5 seconds in our current scripts). Utterances + shorter than this are discarded; those longer are split into chunks, with + either overlaps between the chunks or small gaps between them. Note that + our acoustic models typically require left or right frames for acoustic + context; we add that, but this is a separate issue; the context is added after + the chunks are decided on. + + Our minibatch size is usually a power of 2, and it can be limited by GPU + memory considerations. Many of our example scripts use 128 chunks per + minibatch. The largest single consumer of GPU memory is the alpha + probabilities in the forward-backward computation. For instance, with 1.5-second + chunks, we have 50 time steps after the 3-fold subsampling. In our + Switchboard setup a typical denominator FST has 30,000 states in it. We use + single-precision floating point for the alphas, so the memory used in + gigabytes is (128 * 50 * 30000 * 4) / 10^9 = 0.768G. + + This won't use up all the GPU memory, but there are other sources of memory use, + e.g. we keep around two copies of the nnet outputs in memory, which takes a + fair amount of memory depending on the configuration-- e.g. replace the 30000 + above with about 10000 and it will give you the amount of memory used for one + copy of the nnet outputs in a reasonable configuration. + + + \subsection chain_training_shifting Training on frame-shifted data + + In neural net training we already have ways of generating perturbed data to + artificially increase the amount of data we train on. Our standard nnet3 + neural-net training example scripts do time-warping of the raw audio, by + factors of 0.9, 1.0 and 1.1, to create 3-fold augmented data. This is + orthogonal to the 'chain' models, and we do it (or not) just as we would for + the baseline. However, there is an extra way we can augment the data for the + chain models, by shifting the frames. The output frame rate for these models + is one third the regular frame rate (configurable, of course), meaning we only + evaluate the nnet output at t values that are multiples of 3, so we + can generate different versions of the training data by shifting the training + examples by 0, 1 and 2 frames. This is done automatically in the training + script, and it's done 'on the fly' as we read the training examples from + disk-- the program nnet3-chain-copy-egs has a + --frame-shift option that is set by the script. This affects how + the number of epochs is interpreted. If the user requests, for instance, 4 + epochs, then we actually train for 12 epochs; we just do so on 3 + differently-shifted versions of the data. What the + --frame-shift=t option actually does is to shift the input frames + by t and shift the output frames by the closest multiple of 3 to + t. (In general the subsampling factor might not be 3; it's a configuration variable + named --frame-subsampling-factor).
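+ As a concrete illustration of the frame-shifting just described, here is a
+ small Python sketch. It is not the actual nnet3-chain-copy-egs code; it just
+ shows the assumed mapping from a requested shift to the input and output
+ frame indexes, for a frame-subsampling factor of 3:
+ \verbatim
+ # Illustrative sketch of the --frame-shift logic described above.
+ def shift_frames(input_frames, output_frames, shift, subsampling_factor=3):
+     """Shift input frames by 'shift'; shift output frames by the multiple of
+     'subsampling_factor' that is closest to 'shift'."""
+     output_shift = subsampling_factor * int(
+         round(float(shift) / subsampling_factor))
+     return ([t + shift for t in input_frames],
+             [t + output_shift for t in output_frames])
+
+ # Input frames 0..5, output frames evaluated every 3 frames, shift of 1:
+ print(shift_frames(list(range(6)), [0, 3], shift=1))
+ # -> ([1, 2, 3, 4, 5, 6], [0, 3])
+ \endverbatim
+ With shifts of 0, 1 and 2 giving distinct versions of the data, a request for
+ 4 epochs becomes 4 * 3 = 12 passes over the (shifted) data, as described above.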
+ + \subsection chain_training_gpu GPU issues in training + + The parts of the computation that are specific to the 'chain' models are + the forward-backward over the numerator FST and over the denominator HMM. The + numerator part of this is very fast. The denominator forward-backward takes + quite a lot of time, because there can be a large number of arcs in the + denominator FST (e.g. 200,000 arcs and 30,000 states in a typical Switchboard setup). + The time taken can be almost as much as the time taken in the neural-net + parts of the computation. We were quite careful to ensure memory locality. + + The next step to further speed this up is probably to implement a pruned + version of the forward-backward computation (like pruned Viterbi, but + computing posteriors). In order to get a speedup we'd have to prune away a + very high percentage of states, because we'd need to make up for the loss of + memory locality that pruning would bring. In our current implementation we are + careful to ensure that a group of GPU threads are all processing the same + HMM-state and time, just from different chunks (we call these different + 'sequences' in the code); and we make sure that the memory locations + corresponding to these different sequences are all next to each other in + memory, so the GPU can do coalesced memory access. With state-level + pruning, since the memory access for the different sequences would no longer be + 'in sync', we would lose this advantage. It should still be doable to get a + pruned version of the forward-backward algorithm, though. + + For speed, we don't use log values in the alpha-beta computation for the + denominator graph. In order to keep all the numerical values in a suitable + range, we multiply all the acoustic probabilities (exponentiated nnet outputs) + on each frame by an 'arbitrary value' selected to ensure that our alpha scores + stay in a good range. We call this an 'arbitrary value' because the algorithm + is designed so that we could choose any value here, and it would still be + mathematically correct. We designate one HMM state as a 'special state', and + the 'arbitrary constant' is chosen to be the inverse of that special state's alpha + on the previous frame. This keeps the special state's alpha values close to + one. As the 'special state' we choose a state that has high probability in the + limiting distribution of the HMM, and which can access the majority of states + of the HMM. + + \section chain_decoding Decoding with 'chain' models + + The decoding process with 'chain' models is exactly the same as for regular nnet3 + neural-net based models, and in fact uses the same script (steps/nnet3/decode.sh). + There are a few configuration differences: + + - Firstly, the graph is built with a different and simpler topology; but this requires + no special action by the user, as the graph-building script anyway takes the + topology from the 'final.mdl' produced by the 'chain' training script, which + contains the correct topology. + + - By default when we compile the graph, we use a 'self-loop-scale' of 0.1. + This affects how the transition probabilities on self-loops are treated + (it generally works better). However, for the 'chain' models, because of + how they were trained, we need to use exactly the same + transition-probability scaling we trained with, which for simplicity we + have set to 1.0. So we supply the option --self-loop-scale + 1.0 to the utils/mkgraph.sh script. + + - There is no 'division by the prior' necessary in these models.
So we simply + don't set the vector of priors in the .mdl files; we made sure + that the decoder just omits the division by the prior if the priors are not set. + + - The default acoustic scale we typically use in decoding (0.1) is not + suitable-- for 'chain' models the optimal acoustic scale is very close to 1. + So we supply the option --acwt 1.0 to the script + steps/nnet3/decode.sh. + + - The scoring scripts can only search the language-model scale in increments + of 1, which works well in typical setups where the optimal language-model scale + is between 10 and 15, but not when the optimal language-model scale is close + to 1 as it is here. (Note: for current purposes you can treat the language-model + scale as the same as the inverse of the acoustic scale). In order to + work around this issue without changing the scoring scripts (which are + database-specific), we supply a new option --post-decode-acwt 10.0 + to the script steps/nnet3/decode.sh, + which scales the acoustic probabilities by 10 before dumping the lattice. + After this, the optimal language-model scale will be around 10, which might + be a little confusing if you are not aware of this issue, but is convenient + for the way the scoring scripts are set up. + + - The default decoding and lattice beams are suitable without modification + for the 'chain' models, once you use the --acwt 1.0 option. + However, they won't show the full possible speedup, and you can get faster + decoding by using slightly tighter beams. By tightening the beam in the + Switchboard setup we were able to get decoding time down from around 1.5 + times real time to around 0.5 times real time, with only around 0.2\% + degradation in accuracy (this was with neural net evaluation on the CPU; on + the GPU it would have been even faster). Note from Dan: this is all to the best + of my recollection as I write this; actually the degradation may have been more than + that. And bear in mind that this was on high-powered modern server machines + (single-threaded). + + You might notice in the current example scripts that we use iVectors. We do so + just because they generally help a bit, and because the baseline setup we were + comparing with uses them. There is no inherent connection with 'chain' + models, and no fundamental requirement to use them. Actually we want to get rid + of them (see below). + + + \section chain_next_steps Next steps (TODOs) with 'chain' models + + (Note: this list is valid as of Dec 13 2015, but may become out of date). + Things we need to do (and that we'd like help with) are: + - Supply example scripts (and tune them) on a wide range of corpora. + (It will be interesting to see whether there are scale-dependent effects + affecting how well this model works). + - Create and tune LSTM and BLSTM versions of the training script. (This + may involve some playing around with learning rate schedules and + configurations). + - Figure out how to speed up the forward-backward part of the computation. + (E.g. using state-level pruning, or just by optimizing the current kernels or + data structures). + + A longer-term TODO, which Dan should do, is to create an online decoding setup + for these models. Actually this isn't really distinct from nnet3 online + decoding in general, since the models are no different from regular nnet3 + acoustic models. But we do have to decide whether to continue to support + iVectors-- getting rid of them would simplify the setup considerably, and + would hopefully make it more robust.
We are hoping that with LSTMs, since it + already sees quite a wide acoustic context, iVector adaptation will no longer + be as helpful and could be dropped. We also have other ideas how to + incorporate adaptation as part of the neural network, without the use of + iVectors. This will require some experimentation. + + +*/ + +} diff --git a/src/doc/cpplint.py b/src/doc/cpplint.py index 837620b0b68..03d0569ab1c 100755 --- a/src/doc/cpplint.py +++ b/src/doc/cpplint.py @@ -2567,8 +2567,8 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, include_state, error(filename, linenum, 'runtime/memset', 4, 'Did you mean "memset(%s, 0, %s)"?' % (match.group(1), match.group(2))) - - if Search(r'\busing namespace\b', line): + match = Search(r'\busing namespace kaldi\b',line) + if not match and Search(r'\busing namespace\b', line): error(filename, linenum, 'build/namespaces', 5, 'Do not use namespace using-directives. ' 'Use using-declarations instead.') diff --git a/src/doc/data_prep.dox b/src/doc/data_prep.dox index ecf0ecc67b9..9db285b340b 100644 --- a/src/doc/data_prep.dox +++ b/src/doc/data_prep.dox @@ -25,11 +25,11 @@ After running the example scripts (see \ref tutorial), you may want to set up Kaldi to run with your own data. This section explains how to prepare the data. This page will assume that you are using the latest version of the example scripts - (typically named "s5" in the example directories, e.g. egs/rm/s5/). + (typically named "s5" in the example directories, e.g. egs/rm/s5/). In addition to this page, you can refer to the data preparation scripts in those directories. The top-level run.sh scripts (e.g. egs/rm/s5/run.sh) have a few commands at the top of them that relate to various phases of data preparation. The parts in - the sub-directory named local/ are always specific to the database. For example, + the sub-directory named local/ are always specific to the database. For example, in the Resource Management (RM) setup it is local/rm_data_prep.sh. In the case of RM these commands are: \verbatim @@ -85,7 +85,7 @@ cmvn.scp feats.scp reco2file_and_channel segments spk2utt text utt2spk wa Not all of the files are equally important. For a simple setup where there is no "segmentation" information (i.e. each utterance corresponds to a single file), the only files you have to create yourself are "utt2spk", "text" and "wav.scp" and possibly -"segments" and "reco2file_and_channel", and the rest will be created by standard scripts. +"segments" and "reco2file_and_channel", and the rest will be created by standard scripts. We will describe the files in this directory, starting with the files you need to create yourself. @@ -95,7 +95,7 @@ yourself. The file "text" contains the transcriptions of each utterance. \verbatim s5# head -3 data/train/text -sw02001-A_000098-001156 HI UM YEAH I'D LIKE TO TALK ABOUT HOW YOU DRESS FOR WORK AND +sw02001-A_000098-001156 HI UM YEAH I'D LIKE TO TALK ABOUT HOW YOU DRESS FOR WORK AND sw02001-A_001980-002131 UM-HUM sw02001-A_002736-002893 AND IS \endverbatim @@ -104,12 +104,12 @@ but if you have speaker information in your setup, you should make the speaker-i prefix of the utterance id; this is important for reasons relating to the sorting of these files. The rest of the line is the transcription of each sentence. You don't have to make sure that all words in this file are in your vocabulary; out of vocabulary words will -get mapped to a word specified in the file data/lang/oov.txt. +get mapped to a word specified in the file data/lang/oov.txt. 
Note: although, in this particular example we have used an underscore to separate the "speaker" and "utterance" parts of the utterance-id, in general it is probably safer to use a dash ("-"). This is because it has a lower ASCII value; it has been pointed out -to me that if an underscore is used, and if the speaker-ids vary in length, in certain -cases the speaker-ids and their corresponding utterance ids can end up being sorted in +to me that if an underscore is used, and if the speaker-ids vary in length, in certain +cases the speaker-ids and their corresponding utterance ids can end up being sorted in different orders when using the standard "C"-style ordering on strings. \endverbatim Another important file is wav.scp. In the Switchboard example, @@ -118,7 +118,7 @@ s5# head -3 data/train/wav.scp sw02001-A /home/dpovey/kaldi-trunk/tools/sph2pipe_v2.5/sph2pipe -f wav -p -c 1 /export/corpora3/LDC/LDC97S62/swb1/sw02001.sph | sw02001-B /home/dpovey/kaldi-trunk/tools/sph2pipe_v2.5/sph2pipe -f wav -p -c 2 /export/corpora3/LDC/LDC97S62/swb1/sw02001.sph | \endverbatim -The format of this file is +The format of this file is \verbatim \endverbatim @@ -135,7 +135,7 @@ sw02001-A_000098-001156 sw02001-A 0.98 11.56 sw02001-A_001980-002131 sw02001-A 19.8 21.31 sw02001-A_002736-002893 sw02001-A 27.36 28.93 \endverbatim -The format of the "segments" file is: +The format of the "segments" file is: \verbatim \endverbatim @@ -146,7 +146,7 @@ an arbitrary identifier that you can choose. The file "reco2file_and_channel" is only used when scoring (measuring error rates) with NIST's "sclite" tool: \verbatim -s5# head -3 data/train/reco2file_and_channel +s5# head -3 data/train/reco2file_and_channel sw02001-A sw02001 A sw02001-B sw02001 B sw02005-A sw02005 A @@ -156,7 +156,7 @@ The format is: \endverbatim The filename is typically the name of the .sph file, without the suffix, but in -general it's whatever identifier you have in your "stm" file. +general it's whatever identifier you have in your "stm" file. The recording side is a concept that relates to telephone conversations where there are two channels, and if not, it's probably safe to use "A". If you don't have an "stm" file or you have no idea what this is all about, then you don't need @@ -202,8 +202,8 @@ All of these files should be sorted. If they are not sorted, you will get error when you run the scripts. In \ref io_sec_tables we explain why this is needed. It has to do with the I/O framework; the ultimate reason for the sorting is to enable something equivalent to random-access lookup on a stream that doesn't support -fseek(), such as a piped command. Many Kaldi programs are reading multiple pipes -from other Kaldi commands, reading different types of object, and are doing something +fseek(), such as a piped command. Many Kaldi programs are reading multiple pipes +from other Kaldi commands, reading different types of object, and are doing something roughly comparable to merge-sort on the different inputs; merge-sort, of course, requires that the inputs be sorted. Be careful when you sort that you have the shell variable LC_ALL defined as "C", @@ -249,37 +249,37 @@ that is what we use in this particular script. The format is: \verbatim \endverbatim -Each of the feature files contains a matrix, in Kaldi format. -In this case the dimension of the matrix would be (the length of the file in 10ms intervals) by 13. +Each of the feature files contains a matrix, in Kaldi format. 
+In this case the dimension of the matrix would be (the length of the file in 10ms intervals) by 13. The "extended filename" /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/raw_mfcc_train.1.ark:24 means, open the "archive" file /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/raw_mfcc_train.1.ark, fseek() -to position 24, and read the data that's there. +to position 24, and read the data that's there. This feats.scp file is created by the command \verbatim -steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir +steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir \endverbatim which is invoked by the top-level "run.sh" script. For the definitions of the shell variables, see that script. \$mfccdir is a user-specified directory where the -.ark files will be written. +.ark files will be written. The last file in the directory data/train is "cmvn.scp". This contains statistics for cepstral mean and variance normalization, indexed by speaker. Each set of statistics is a matrix, of dimension 2 by 14 in this case. In our example, we have: \verbatim -s5# head -3 data/train/cmvn.scp +s5# head -3 data/train/cmvn.scp 2001-A /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/cmvn_train.ark:7 2001-B /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/cmvn_train.ark:253 2005-A /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/cmvn_train.ark:499 \endverbatim Unlike feats.scp, this scp file is indexed by speaker-id, not utterance-id. -This file is created by a command such as this: +This file is created by a command such as this: \verbatim -steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir +steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir \endverbatim (this example is from egs/swbd/s5/run.sh). -Because errors in data preparation can cause problems later on, we have a script to +Because errors in data preparation can cause problems later on, we have a script to check that the data directory is correctly formatted. Run e.g. \verbatim utils/validate_data_dir.sh data/train @@ -307,7 +307,7 @@ s5# ls data/lang_test G.fst L.fst L_disambig.fst oov.int oov.txt phones phones.txt topo words.txt \endverbatim Note that lang_test/ was created by copying lang/ and adding G.fst. -Each of these directories seems to contain only a few files. +Each of these directories seems to contain only a few files. It's not quite as simple as this though, because "phones" is a directory: \verbatim s5# ls data/lang/phones @@ -347,9 +347,9 @@ utils/int2sym.pl and utils/sym2int.pl, and by the OpenFst programs fstcompile an fstprint. The file L.fst is the Finite State Transducer form of the lexicon (L, -see "Speech Recognition -with Weighted Finite-State Transducers" by Mohri, Pereira and -Riley, in Springer Handbook on SpeechProcessing and Speech Communication, 2008). +see "Speech Recognition +with Weighted Finite-State Transducers" by Mohri, Pereira and +Riley, in Springer Handbook on SpeechProcessing and Speech Communication, 2008). with phone symbols on the input and word symbols on the output. The file L_disambig.fst is the lexicon, as above but including the disambiguation symbols \#1, \#2, and so on, as well as the self-loop with \#0 on it to "pass through" @@ -368,7 +368,7 @@ containing just a phone that we designate as a "garbage phone"; this phone will align with various kinds of spoken noise. 
In our particular setup, this phone is called \ (short for "spoken noise"): \verbatim -s5# grep -w UNK data/local/dict/lexicon.txt +s5# grep -w UNK data/local/dict/lexicon.txt SPN \endverbatim The file oov.int contains the integer form of this (extracted from words.txt), @@ -404,20 +404,20 @@ s5# cat data/lang/topo \endverbatim This specifies the topology of the HMMs we use. In this case, the "real" phones contain -three emitting states +three emitting states with the standard 3-state left-to-right topology-- the "Bakis model". (Emitting states are states that "emit" feature vectors, as distinct from the "fake" non-emitting states that are just used to glue other states together). Phones 1 to 20 are various kinds of silence and noise; we have a lot because of word-position-dependency, and in fact most of these will never be used; the real number excluding word position -dependency is more like five. The "silence phones" have a more complex topology with an +dependency is more like five. The "silence phones" have a more complex topology with an initial emitting state and an end emitting state, but then three emitting states in the middle. You don't have to create this file by hand. There are a number of files in data/lang/phones/ that specify various things about the phone set. Most of these files exist in three separate versions: a ".txt" form, e.g.: \verbatim -s5# head -3 data/lang/phones/context_indep.txt +s5# head -3 data/lang/phones/context_indep.txt SIL SIL_B SIL_E @@ -432,7 +432,7 @@ s5# head -3 data/lang/phones/context_indep.int and a ".csl" form, which in a slight abuse of notation, denotes a colon-separated list, not a comma-separated list: \verbatim -s5# cat data/lang/phones/context_indep.csl +s5# cat data/lang/phones/context_indep.csl 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20 \endverbatim These files always contain the same information, so let's focus on the ".txt" form which @@ -474,7 +474,7 @@ lexicon (not part of a word), SIL_B would be a silence phone at the b (which should never exist), SIL_I word-internal silence (unlikely to exist), SIL_E word-ending silence (should never exist), and SIL_S would be silence as a "singleton word", i.e. a phone with only one word-- this might be used if you had a "silence word" in your -lexicon and explicit silences appear in your transcriptions. +lexicon and explicit silences appear in your transcriptions. The files silence.txt and nonsilence.txt contains lists of the silence phones and nonsilence phones respectively. These should be mutually exclusive and together, @@ -489,11 +489,11 @@ to designate all silence, noise and vocalized-noise phones as "silence" phones, phones representing traditional phonemes as "nonsilence" phones. We haven't experimented in Kaldi with the best way to do this. \verbatim -s5# head -3 data/lang/phones/silence.txt +s5# head -3 data/lang/phones/silence.txt SIL SIL_B SIL_E -s5# head -3 data/lang/phones/nonsilence.txt +s5# head -3 data/lang/phones/nonsilence.txt IY_B IY_E IY_I @@ -502,7 +502,7 @@ IY_I The file disambig.txt contains a list of the "disambiguation symbols" (see \ref graph_disambig): \verbatim -s5# head -3 data/lang/phones/disambig.txt +s5# head -3 data/lang/phones/disambig.txt #0 #1 #2 @@ -512,7 +512,7 @@ These symbols appear in the file phones.txt as if they were phones. 
The file optional_silence.txt contains a single phone which can optionally appear between words: \verbatim -s5# cat data/lang/phones/optional_silence.txt +s5# cat data/lang/phones/optional_silence.txt SIL \endverbatim The mechanism by which it appears optionally between words is that it appears @@ -527,7 +527,7 @@ rather than linguistically meaningful ones). In this particular setup, sets.txt groups together all the word-position-dependent versions of each phone: \verbatim -s5# head -3 data/lang/phones/sets.txt +s5# head -3 data/lang/phones/sets.txt SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S @@ -536,27 +536,27 @@ NSN NSN_B NSN_E NSN_I NSN_S The file extra_questions.txt contains some extra questions which we'll include in addition to the automatically generated questions: \verbatim -s5# cat data/lang/phones/extra_questions.txt -IY_B B_B D_B F_B G_B K_B SH_B L_B M_B N_B OW_B AA_B TH_B P_B OY_B R_B UH_B AE_B S_B T_B AH_B V_B W_B Y_B Z_B CH_B AO_B DH_B UW_B ZH_B EH_B AW_B AX_B EL_B AY_B EN_B HH_B ER_B IH_B JH_B EY_B NG_B -IY_E B_E D_E F_E G_E K_E SH_E L_E M_E N_E OW_E AA_E TH_E P_E OY_E R_E UH_E AE_E S_E T_E AH_E V_E W_E Y_E Z_E CH_E AO_E DH_E UW_E ZH_E EH_E AW_E AX_E EL_E AY_E EN_E HH_E ER_E IH_E JH_E EY_E NG_E -IY_I B_I D_I F_I G_I K_I SH_I L_I M_I N_I OW_I AA_I TH_I P_I OY_I R_I UH_I AE_I S_I T_I AH_I V_I W_I Y_I Z_I CH_I AO_I DH_I UW_I ZH_I EH_I AW_I AX_I EL_I AY_I EN_I HH_I ER_I IH_I JH_I EY_I NG_I -IY_S B_S D_S F_S G_S K_S SH_S L_S M_S N_S OW_S AA_S TH_S P_S OY_S R_S UH_S AE_S S_S T_S AH_S V_S W_S Y_S Z_S CH_S AO_S DH_S UW_S ZH_S EH_S AW_S AX_S EL_S AY_S EN_S HH_S ER_S IH_S JH_S EY_S NG_S -SIL SPN NSN LAU -SIL_B SPN_B NSN_B LAU_B -SIL_E SPN_E NSN_E LAU_E -SIL_I SPN_I NSN_I LAU_I -SIL_S SPN_S NSN_S LAU_S +s5# cat data/lang/phones/extra_questions.txt +IY_B B_B D_B F_B G_B K_B SH_B L_B M_B N_B OW_B AA_B TH_B P_B OY_B R_B UH_B AE_B S_B T_B AH_B V_B W_B Y_B Z_B CH_B AO_B DH_B UW_B ZH_B EH_B AW_B AX_B EL_B AY_B EN_B HH_B ER_B IH_B JH_B EY_B NG_B +IY_E B_E D_E F_E G_E K_E SH_E L_E M_E N_E OW_E AA_E TH_E P_E OY_E R_E UH_E AE_E S_E T_E AH_E V_E W_E Y_E Z_E CH_E AO_E DH_E UW_E ZH_E EH_E AW_E AX_E EL_E AY_E EN_E HH_E ER_E IH_E JH_E EY_E NG_E +IY_I B_I D_I F_I G_I K_I SH_I L_I M_I N_I OW_I AA_I TH_I P_I OY_I R_I UH_I AE_I S_I T_I AH_I V_I W_I Y_I Z_I CH_I AO_I DH_I UW_I ZH_I EH_I AW_I AX_I EL_I AY_I EN_I HH_I ER_I IH_I JH_I EY_I NG_I +IY_S B_S D_S F_S G_S K_S SH_S L_S M_S N_S OW_S AA_S TH_S P_S OY_S R_S UH_S AE_S S_S T_S AH_S V_S W_S Y_S Z_S CH_S AO_S DH_S UW_S ZH_S EH_S AW_S AX_S EL_S AY_S EN_S HH_S ER_S IH_S JH_S EY_S NG_S +SIL SPN NSN LAU +SIL_B SPN_B NSN_B LAU_B +SIL_E SPN_E NSN_E LAU_E +SIL_I SPN_I NSN_I LAU_I +SIL_S SPN_S NSN_S LAU_S \endverbatim You will observe that a question is simply a set of phones. The first four questions are asking about the word-position, for regular phones; and the last five do the same for the "silence phones". The "silence" phones also come in a variety without a suffix like _B, for example SIL. These may appear as optional silence in the lexicon, i.e. not inside an actual word. In setups with things like tone dependency or stress markings, extra_questions.txt -may contain questions that relate to those features. +may contain questions that relate to those features. 
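+Just to make the structure of these questions concrete, here is a small Python
+sketch (illustrative only; this is not what utils/prepare_lang.sh actually runs)
+that builds word-position questions of the kind shown above from a phone list:
+\verbatim
+# Sketch: generate word-position questions like those in extra_questions.txt.
+# The phone lists below are truncated examples, not a real phone set.
+real_phones = ["IY", "B", "D"]
+silence_phones = ["SIL", "SPN", "NSN", "LAU"]
+
+questions = []
+for suffix in ["_B", "_E", "_I", "_S"]:      # position-dependent real phones
+    questions.append([p + suffix for p in real_phones])
+questions.append(list(silence_phones))       # un-suffixed silence phones
+for suffix in ["_B", "_E", "_I", "_S"]:      # position-dependent silences
+    questions.append([p + suffix for p in silence_phones])
+
+for q in questions:                          # one question (phone set) per line
+    print(" ".join(q))
+\endverbatim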
The file word_boundary.txt explains how the phones relate to word positions: \verbatim -s5# head data/lang/phones/word_boundary.txt +s5# head data/lang/phones/word_boundary.txt SIL nonword SIL_B begin SIL_E end @@ -570,14 +570,14 @@ we don't like to hardcode this in the text form of the phones-- for one thing, K never see the text form of the phones, but only an integerized form. So it is specified by this file word_boundary.txt. The main reason we need this information is in order to recover the word boundaries within lattices (for example, the program -lattice-align-words reads the integer versin of this file, word_boundaray.int). +lattice-align-words reads the integer versin of this file, word_boundaray.int). Finding the word boundaries is useful for reasons including NIST sclite scoring, which requires the time markings for words, and for other downstream processing. The file roots.txt contains information that relates to how we build the phonetic-context decision tree: \verbatim -head data/lang/phones/roots.txt +head data/lang/phones/roots.txt shared split SIL SIL_B SIL_E SIL_I SIL_S shared split SPN SPN_B SPN_E SPN_I SPN_S shared split NSN NSN_B NSN_E NSN_I NSN_S @@ -607,7 +607,7 @@ utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang \endverbatim Here, the inputs are the directory data/local/dict/, and the label \ which is the dictionary word we will map OOV words to when appear in transcripts -(this becomes data/lang/oov.txt). The location data/local/lang/ is simply a +(this becomes data/lang/oov.txt). The location data/local/lang/ is simply a temporary directory which the script will use; data/lang/ is where it actually puts its output. @@ -617,21 +617,21 @@ The thing which you, as the data-preparer, need to create, is the directory s5# ls data/local/dict extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt \endverbatim -(in fact there are a few more files there which we haven't listed, but they are just temporary files that +(in fact there are a few more files there which we haven't listed, but they are just temporary files that were put there while creating that directory, and we can ignore them). The commands below give you an idea what is in these files: \verbatim -s5# head -3 data/local/dict/nonsilence_phones.txt +s5# head -3 data/local/dict/nonsilence_phones.txt IY B D -s5# cat data/local/dict/silence_phones.txt +s5# cat data/local/dict/silence_phones.txt SIL SPN NSN LAU -s5# cat data/local/dict/extra_questions.txt -s5# head -5 data/local/dict/lexicon.txt +s5# cat data/local/dict/extra_questions.txt +s5# head -5 data/local/dict/lexicon.txt !SIL SIL -'S S -'S Z @@ -650,7 +650,7 @@ on separate lines, if we have multiple pronunciations for it. If you want to use pronunciation probabilities, instead of creating the file lexicon.txt, create a file called lexiconp.txt that has the probability as the second field. -Note that it is a common practice to normalize the pronunciations probabilities so that +Note that it is a common practice to normalize the pronunciations probabilities so that instead of summing to one, the most probable pronunciation for each word is one. This tends to give better results. For a top-level script that runs with pronunciation probabilities, search for pp in egs/wsj/s5/run.sh. @@ -666,35 +666,35 @@ versions of a particular phone that have different stress or tone. In order to demonstrate what this looks like, we'll view the same files as above, but in the egs/wsj/s5/ setup. 
The result is below: \verbatim -s5# cat data/local/dict/silence_phones.txt +s5# cat data/local/dict/silence_phones.txt SIL SPN NSN -s5# head data/local/dict/nonsilence_phones.txt -S -UW UW0 UW1 UW2 -T -N -K -Y -Z -AO AO0 AO1 AO2 -AY AY0 AY1 AY2 -SH -s5# head -6 data/local/dict/lexicon.txt +s5# head data/local/dict/nonsilence_phones.txt +S +UW UW0 UW1 UW2 +T +N +K +Y +Z +AO AO0 AO1 AO2 +AY AY0 AY1 AY2 +SH +s5# head -6 data/local/dict/lexicon.txt !SIL SIL SPN SPN NSN !EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T "CLOSE-QUOTE K L OW1 Z K W OW1 T -s5# cat data/local/dict/extra_questions.txt -SIL SPN NSN -S UW T N K Y Z AO AY SH W NG EY B CH OY JH D ZH G UH F V ER AA IH M DH L AH P OW AW HH AE R TH IY EH -UW1 AO1 AY1 EY1 OY1 UH1 ER1 AA1 IH1 AH1 OW1 AW1 AE1 IY1 EH1 -UW0 AO0 AY0 EY0 OY0 UH0 ER0 AA0 IH0 AH0 OW0 AW0 AE0 IY0 EH0 -UW2 AO2 AY2 EY2 OY2 UH2 ER2 AA2 IH2 AH2 OW2 AW2 AE2 IY2 EH2 -s5# +s5# cat data/local/dict/extra_questions.txt +SIL SPN NSN +S UW T N K Y Z AO AY SH W NG EY B CH OY JH D ZH G UH F V ER AA IH M DH L AH P OW AW HH AE R TH IY EH +UW1 AO1 AY1 EY1 OY1 UH1 ER1 AA1 IH1 AH1 OW1 AW1 AE1 IY1 EH1 +UW0 AO0 AY0 EY0 OY0 UH0 ER0 AA0 IH0 AH0 OW0 AW0 AE0 IY0 EH0 +UW2 AO2 AY2 EY2 OY2 UH2 ER2 AA2 IH2 AH2 OW2 AW2 AE2 IY2 EH2 +s5# \endverbatim You may notice that some of the lines in nonsilence_phones.txt contain multiple phones on a single line. These are the different stress-dependent @@ -722,8 +722,8 @@ of the stress-dependent versions of phones may have too little data to robustly estimate either a separate decision tree or the phone clustering information that's used in producing the questions. By grouping them together like this, we ensure that in the absence of enough data to estimate them -separately, these different versions of the phone all "stay together" throughout -the decision-tree building process. +separately, these different versions of the phone all "stay together" throughout +the decision-tree building process. We should mention at this point that the script utils/prepare_lang.sh supports a number of options. To give you an idea of what they are, here is @@ -731,7 +731,7 @@ the usage messages of that script: \verbatim usage: utils/prepare_lang.sh e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang -options: +options: --num-sil-states # default: 5, #states in silence models. --num-nonsil-states # default: 3, #states in non-silence models. --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I @@ -778,7 +778,7 @@ local/make_rm_lm.pl $RMROOT/rm1_audio1/rm1/doc/wp_gram.txt > $tmpdir/G.txt || e This script local/make_rm_lm.pl creates a grammar in FST format (text format, not binary format). It contains lines like the following: \verbatim -s5# head data/local/tmp/G.txt +s5# head data/local/tmp/G.txt 0 1 ADD ADD 5.19849703126583 0 2 AJAX+S AJAX+S 5.19849703126583 0 3 APALACHICOLA+S APALACHICOLA+S 5.19849703126583 @@ -788,7 +788,7 @@ have a useful tutorial). The script local/rm_prepare_grammar.sh will the binary-format file G.fst using the following statement: \verbatim fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt --keep_isymbols=false \ - --keep_osymbols=false $tmpdir/G.txt > data/lang/G.fst + --keep_osymbols=false $tmpdir/G.txt > data/lang/G.fst \endverbatim If you want to create your own grammar, you will probably want to do something similar. Note: this type of procedure only applies to grammars of a certain class: it won't @@ -797,7 +797,13 @@ in OpenFst format. 
There are ways to do this in the WFST framework (e.g. see recent work by Mike Riley with push down transducers), but we have not yet worked with those ideas in Kaldi. -In the WSJ setup, we use a statistical language model. The script local/wsj_format_data.sh +Please, before asking any questions on the list about language models or about making +grammar FSTs, read "A Bit of Progress in Language Modeling" by Joshua Goodman; and go to +www.openfst.org and do the FST tutorial so that you understand the basics of finite +state transducers. (Note that language models would be represented as finite state +acceptors, or FSAs, which can be considered a special case of finite state transducers). + +In the WSJ setup (like most setups), we use a statistical language model. The script local/wsj_format_data.sh deals with converting the ARPA-format language models supplied with the WSJ database, into an OpenFst format. Some of the key commands from that script are: \verbatim @@ -832,5 +838,9 @@ FST from text form into OpenFst binary form; and fstrmepsilon is also an OpenFst command, which removes the small number of \ symbols from the FST, which were converted from \ and \. +A popular toolkit for building language models is SRILM. Various language +modeling toolkits are used in the Kaldi example scripts. SRILM is the best +documented and most fully featured, and we generally recommend it (its only +drawback is that it doesn't have the most free licence). */ diff --git a/src/doc/dnn.dox b/src/doc/dnn.dox index 25b6e8e19cf..5b3d2b98261 100644 --- a/src/doc/dnn.dox +++ b/src/doc/dnn.dox @@ -76,6 +76,7 @@ namespace kaldi { - Documentation for Karel's version is available at \subpage dnn1 - Documentation for Dan's old version is available at \subpage dnn2. - Documentation for the nnet3 setup is available at \subpage dnn3. + - Documentation for the 'nnet3+chain' setup is available at \subpage chain. */ diff --git a/src/doc/dnn3_code_data_types.dox b/src/doc/dnn3_code_data_types.dox index f72721e1715..30623e6c658 100644 --- a/src/doc/dnn3_code_data_types.dox +++ b/src/doc/dnn3_code_data_types.dox @@ -26,7 +26,7 @@ namespace nnet3 { - Up: \ref dnn3 - Next: \ref dnn3_code_compilation - + \section dnn3_dt_problem Objectives and background The previous \ref dnn1 "nnet1" and \ref dnn2 "nnet2" setups are based on a Component @@ -89,7 +89,7 @@ output-node name=output input=output_nonlin with a number of additional indexes: time (t), an index (n) that indicates the example within the minibatch (e.g. 0 through 511 for a 512-example minibatch), plus an "extra" index (x) that may eventually be - useful in convolutional approaches but is usually zero for now. + useful in convolutional approaches but is usually zero for now. To formalize the above, we define an Index as a tuple (n, t, x). We will also define a \ref Cindex as a tuple (node-index, Index), where the node-index is @@ -108,7 +108,7 @@ output-node name=output input=output_nonlin for receiving matrix-valued input, evaluating the NnetComputation, and supplying matrix-valued output. Think of this as the run-time of a very limited interpreted language. - + \section dnn3_dt_data_structures Basic data structures in nnet3 \subsection dnn3_dt_datastruct_index Indexes @@ -149,13 +149,13 @@ output-node name=output input=output_nonlin \verbatim [ (0, -1:1) (1, -1:1) ... ] \endverbatim - - + + \subsection dnn3_dt_datastruct_cindex Cindexes A \ref Cindex is a pair (int32, Index), where the int32 corresponds to the index of a node in a neural network.
As mentioned above, a \ref Nnet "neural network" consists of a collection of - named Components and a kind of graph on "nodes", and the nodes have indexes. + named Components and a kind of graph on "nodes", and the nodes have indexes. Cindexes are used during the compilation process, and they correspond to the nodes of a "computation graph" corresponding to a specific neural net computation. There is a correspondence @@ -217,8 +217,8 @@ output-node name=output input=output_nonlin Computation acts on are a list of matrices, and also submatrices that may occupy row or column ranges of a matrix. A Computation also contains various sets of indexes (arrays of integers and so on) that are sometimes required as - arguments to particular matrix operations. - + arguments to particular matrix operations. + We will describe this in more detail below in \ref dnn3_dt_nnet_computation. \subsection dnn3_dt_data_struct_computer NnetComputer @@ -229,7 +229,7 @@ output-node name=output input=output_nonlin the NnetComputation. - \section dnn3_dt_nnet Neural networks in nnet3 + \section dnn3_dt_nnet Neural networks in nnet3 The previous section should have given you a high-level overview of how the framework fits together. In this section we will go into a little more detail @@ -251,7 +251,7 @@ class Component { virtual void Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, - const CuMatrixBase &out_value, + const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, Component *to_update, // may be NULL; may be identical // to "this" or different. @@ -266,7 +266,7 @@ class Component { to create the corresponding row of the output. In terms of Indexes, this means that the Indexes corresponding to each element of input and output are the same. Similar logic holds in the Backprop function. - + \subsection dnn3_dt_nnet_component_properties Components (properties) @@ -295,7 +295,7 @@ class Component { standard methods (RNNs, LSTMs and so on). Unlike in the \ref dnn2 "nnet2" framework, Components are not responsible for implementing things like splicing across frames; instead we use \ref dnn3_dt_nnet_descriptor_code "Descriptors" to handle that, as will be explained below. - + \subsection dnn3_dt_nnet_node_outline Neural network nodes (outline) @@ -349,7 +349,13 @@ Each component-node in the config file gets expanded to two nodes: a node of type kComponent, and an immediately preceding node of type kDescriptor that is defined by the "input" field. - +The config file above doesn't give an example of a dim-range node. The basic format +of a dim-range node is this (this example would take the first 50 dimensions from the 65 dimensions +of component affine1): +\verbatim +dim-range-node name=dim-range-node1 input-node=affine1_node dim-offset=0 dim=50 +\endverbatim + \subsection dnn3_dt_nnet_descriptor_code Descriptors in config files A Descriptor is a very limited type of expression that refers to quantities defined @@ -366,8 +372,8 @@ defined by the "input" field. \verbatim # caution, this is a simplification that overgenerates descriptors. ::= ;; node name of kInput or kComponent node. - ::= Append(, [, ... ] ) - ::= Sum(, ) + ::= Append(, [, ... ] ) + ::= Sum(, [, ...]) ;; Failover or IfDefined might be useful for time t=-1 in a RNN, for instance. ::= Failover(, ) ;; 1st arg if computable, else 2nd ::= IfDefined() ;; the arg if defined, else zero. @@ -386,10 +392,12 @@ defined by the "input" field. 
::= ReplaceIndex(, , ) \endverbatim - -Now we will describe the actual syntax, which differs from the above simplified -version because expressions may appear only in a certain hierarchy. This -syntax also corresponds more closely with the class names in the real code. +Now we will describe the actual syntax which the code uses internally, which +differs from the above simplified version because expressions may appear only in +a certain hierarchy. This syntax also corresponds more closely with the class +names in the real code. The code that reads Descriptors attempts to normalize +them in as general as possible a way, so that almost all of the above syntax +can be read and converted to the internal representation. \verbatim ;;; == class Descriptor ::= Append([, ... ] ) @@ -477,7 +485,7 @@ If the Descriptor is computable for this Index, the function will return true. For instance, the expression Sum(X, Y) would only be computable if X and Y are computable. If this function is going to return true, it will also append to "input_terms" only the input Cindexes that -actually appear in the evaluated expression. +actually appear in the evaluated expression. For example (and speaking loosely), in an expression of the form Failover(X, Y), if X is computable then only X would be appended to "input_terms", and not Y. @@ -491,7 +499,7 @@ and \ref Descriptor::IsComputable() "IsComputable()" with the same interface as SumDescriptor, and also functions such as \ref Descriptor::NumParts() "NumParts()" and \ref Descriptor::Part() "Part(int32 n)" that allow the user to access the individual SumDescriptors in its vector. - + \subsection dnn3_dt_nnet_node_detail Neural network nodes (detail) We will now describe neural network nodes in more detail. As mentioned above, @@ -502,9 +510,9 @@ enum NodeType { kInput, kDescriptor, kComponent, kDimRange }; The actual NetworkNode is a struct. To avoid the hassle of pointers and because C++ doesn't allow unions containing classes, we have a slightly messy layout: \verbatim -struct NetworkNode { +struct NetworkNode { NodeType node_type; - // "descriptor" is relevant only for nodes of type kDescriptor. + // "descriptor" is relevant only for nodes of type kDescriptor. Descriptor descriptor; union { // For kComponent, the index into Nnet::components_ @@ -537,7 +545,7 @@ public: ... private: std::vector component_names_; - std::vector components_; + std::vector components_; std::vector node_names_; std::vector nodes_; @@ -558,7 +566,7 @@ information necessary to interpret them. Internally it defines a number of type including the following enum value: \verbatim enum CommandType { - kAllocMatrixUndefined, kAllocMatrixZeroed, + kAllocMatrixUndefined, kAllocMatrixZeroed, kDeallocMatrix, kPropagate, kStoreStats, kBackprop, kMatrixCopy, kMatrixAdd, kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti, kAddRowsMulti, kAddToRowsMulti, @@ -591,9 +599,9 @@ restricted row and column range of a matrix, like the matlab syntax }; struct SubMatrixInfo { int32 matrix_index; // index into "matrices": the underlying matrix. 
- int32 row_offset; + int32 row_offset; int32 num_rows; - int32 col_offset; + int32 col_offset; int32 num_cols; }; \endverbatim diff --git a/src/doc/examples.dox b/src/doc/examples.dox new file mode 100755 index 00000000000..7ba7a6043f3 --- /dev/null +++ b/src/doc/examples.dox @@ -0,0 +1,404 @@ +// doc/examples.dox + +// Copyright 2016 Fred Richardson Allen Guo + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +/** + \page examples Examples included with Kaldi + + When you check out the Kaldi source tree (see \ref install), you will find many + sets of example scripts in the egs/ directory. This table summarizes some key + facts about some of those example scripts; however, it it not an exhaustive + list. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Name | BW | Lang | Train Domain | Train Hours | Train Speakers | License and Availability | Year Released | Speech Style | Test Domain | Kaldi Approx Perf | Model Type | LM Data | Lexicon
AMI 16k English
(+non-native)
Microphone: head-mike,
single and multiple
distance mikes
100 123 M
66 F
Free /
Download
http://groups.inf.ed.ac.uk/ami/corpus/
2014 Meeting room Same as train
no overlap(?)
~25% WER head (T)DNN
~45% WER distant (B)LSTM
AMI + (opt) Fisher 50K (CMU dict +
kaldi sources)
Aspire English Conversational microphone
developed on telephone
see Fisher 2015 30.8% WER (dev or eval?)
WSJ 16k English Clean close-mic
read speech
80 LDC
LDC93S6B (WSJ0) and LDC94S13B (WSJ1)
1993 Read speech Same 6-7% WER same as train 20k (CMU dict)
RM English read transcript
limited vocab and grammar
LDC
LDC93S3A
1987-1989 read speech same 1-2% WER predefined grammar <1K
RM dict
Timit 16k English read transcript
very limited grammar
630 1986 read speech same ~30-40% PER none ~47 phones
fisher_english 8k English Telephone speech
Auto-transcribed
(errorful transcriptions)
1,600 5203 M
7198 F
LDC
speech: LDC2004S13, LDC2005S13
transcript: LDC2004T19, LDC2005T19
2004/2005 CTS Fisher (may
overlap with
train)
~22% WER (DNN) LDC Fisher CMU dict
Size UNK
Switchboard 1 8k English CTS 300 LDC
Train: LDC97S62
Mississippi State transcriptions
Eval: LDC2002S09 and LDC2002T43
1993/1997/2000 CTS CTS
eval2000 (hub5)
~10% WER (LSTM) Mississippi Trans
+ (opt) Fisher
30K (CMU dict)
Switchboard 1
+ Fisher
8k English CTS see above see above see above see above CTS eval2000
rt03
~12% eval2000
~19% rt03
see above see above
Callhome
Egyptian
Egyptian
Colloquial
Arabic
CTS 120 conv LDC
Speech : LDC97S45
Transcripts : LDC97T19
Lexicon : LDC99L22
1997 CTS hub5 arabic
LDC2002S22
LDC2002T39
50-60% WER Train trans LDC dict
Corpus of
Spontaneous
Japanese
Japanese Mixed style
Close-talking mic
650 hours
(240 hr train)
>1,400 Unclear how to get this
http://www.ninjal.ac.jp/english/products/csj/
http://pj.ninjal.ac.jp/corpus_center/csj/
2004 Mixed 9-10% WER UNK UNK
Fisher Spanish
Callhome Spanish
Caribbean
Spanish
CTS Fisher: 163 hrs
Callhome: 60 hrs?
120 30min conv
Fisher: 136
Callhome:
LDC
Fisher speech : LDC96S35
Fisher transcripts : LDC96T17
Callhome Speech : LDC96S35
Callhome Transcripts : LDC96T17
Fisher: 2010
Callhome: 1996
CTS Kaldi subset
of Fisher
29-30% WER Fisher trans LDC96L16
Gale Arabic
Phase 2
16K Arabic Broadcast
Conversational/Report
320 train
9.3 test
LDC2013S02 LDC2014S07
LDC2013S07 LDC2014T17
LDC2013T17
LDC2013T04
Collected
2006/2007
Broadcast
Conversational
and Report
Report: 13% WER (LSTM)
Conver: 28% WER (LSTM)
Comb: 24% WER (LSTM)
LDC2013T17
LDC2013T04
LDC2014T17
http://alt.qcri.org/
Gale Mandarin 16K Mandarin
Chinese
Broadcast 126 LDC2013S08 LDC2013T20 2006-2007 Broadcast Same as train 17.5% WER [1] LDC2013S08
LDC2013T20
Same as HKUST below
hkust
EARS RT04F data
dev and train [2]
8K Mandarin
Chinese
Telephone Conversational ~145 ~873 LDC2005S15 LDC2005T32 2004 Conversational Same as train 33.5% CER Acoustic trans
(very little)
Both Eng and Man.
CMU dict use for Eng
mdbg dict use for Man
http://www.mdbg.net
librispeech [3] 16K English Read transcription 100 - 960
(460
F: 125-1128
M: 126-1167
http://www.openslr.org/12/ 2015 Read trans Librispeech
~5% Large (books) cmu (with sequitur
G2P)
reverb
sprakbanken Danish Read transcript? 350 Free download
http://www.nb.no/sprakbanken/#ticketsfrom?lang=en
2012 Read/Dictation Same as train 14% WER NST Provided NST Provided?
vystadial_en [4] 8Khz English Telephone, dialog system 41 unk Free 2014 Dialog sys Same as train ~11% WER (GMM/HMM) Train trans CMU + 250
vystadial_cz [4] 8Khz Czech Telephone, dialog system 15 unk Free 2014 Dialog sys Same as train ~50% WER (GMM/HMM) Train trans Rule derived
chime3 16Khz English Read trans, simulated
and real noise
18 WSJ0 + 4 Not clear (Chime performers) 2015 Read
transcript
Same as train
(same channels!)
~12% WER real (4 spkrs)
~12% WER simu
Official WSJ0 5K
trans
WSJ0
voxforge 16Khz English Read trans >75hrs unk Free GPL 2008? Read trans unk unk Train cmu + g2p for oov
Tedlium 16KHz English Presentation/talk 118 666 Free download 2014? Presentation Same as train ~10% WER Cantab provided LM Cantab provided dict
+ +[1] "Audio Augmentation for Speech Recognition" Tom Ko, Vijayaditya Peddinti, Daniel Povey, Sanjeev Khudanpur.
+[2] There should be more Mandarin data from rt04f - 50 hours of dev data I believe (see LDC2004E67, LDC2004E68). There should also be eval data. See https://www.ldc.upenn.edu/collaborations/past-projects/gale/data/gale-pubs.
+[3] See http://www.danielpovey.com/files/2015_icassp_librispeech.pdf for details. Acoustic and language models are available online.
+[4] See http://www.lrec-conf.org/proceedings/lrec2014/pdf/535_Paper.pdf. +*/ diff --git a/src/doc/glossary.dox b/src/doc/glossary.dox index ba42ea12370..31fa62d3389 100644 --- a/src/doc/glossary.dox +++ b/src/doc/glossary.dox @@ -26,7 +26,7 @@ search function of your browser. For convenience the definition of each term's section is preceded and followed by a colon, so for instance, typing ctrl-f ":lattice:" would take you to the section for "lattice". - +

@@ -37,7 +37,7 @@ synonymous with a sequence of transition-ids. Most of the time an alignment is derived from aligning the reference transcript of an utterance, in which case it is called a forced alignment. lattices also contain alignment information as sequences of transition-ids for each word -sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows +sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows alignments in a human-readable format. :forced alignment: see alignment. @@ -54,6 +54,18 @@ of the HMMs, and also various other important integer mappings; see \ref transit This object is generally written at the start of model files. The program \ref bin/show-transitions.cc "show-transitions" shows these. +:G.fst: The grammar FST G.fst which lives in the + data/lang/ directory in the scripts (see \ref data_prep_lang) represents + the language model in a Finite State Transducer format (see www.openfst.org). + For the most part it is an acceptor, meaning the input and output symbols on the + arcs are the same, but for statistical language models with backoff, the backoff + arcs have the "disambiguation symbol" #0 on the input side only. + For many purposes you'll want to get rid of the disambiguation symbols + using the command fstproject --project_output=true. The disambiguation symbols + are needed during graph compilation to make the FST determinizable, but for things + like language-model rescoring you don't want them. + +
*/ diff --git a/src/doc/history.dox b/src/doc/history.dox index a3cb6d6fe27..bf114a3a9e0 100644 --- a/src/doc/history.dox +++ b/src/doc/history.dox @@ -55,13 +55,13 @@ and documentation); we were visited by Michael Riley (who helped us to understand OpenFst and gave some lectures on FSTs), and would like to acknowledge the help of Honza Cernocky (for allowing us to have the workshop and helping to organize it), - Renata Kohlova (administration), and Tomas Kasparek (system administration). + Renata Kohlova (administration), and Tomas Kasparek (system administration). It is possible that this list of contributors contains oversights; any important omissions are unlikely to be intentional. A lot of code was written during the summer of 2010 but we still did not have a complete working system. Some of the participants of the 2010 workshop - continued working to complete the toolkit and get a working set of training scripts. + continued working to complete the toolkit and get a working set of training scripts. The code was released on May 14th, 2011. Since the initial release, Kaldi has been maintained and developed to a large @@ -95,9 +95,15 @@ for his help in organizing the JHU'09 workshop and with the Wall Street Journal recipe. We would also like to acknowledge the help of faculty and staff at Johns Hopkins University's Center for Language and - Speech Processing during the JHU'09 workshop: particularly + Speech Processing during the JHU'09 workshop: particularly Sanjeev Khudanpur, Desiree Cleves and the late Fred Jelinek. + Since 2012, Kaldi development has received significant support from IARPA's + BABEL program (IARPA-BAA-11-02) and from the Human Language Technology + Center of Excellence (HLTCOE); and since 2015, from the NSF computing + research infrastructure (CRI) award ``CI-EN: Enhancements for the Kaldi Speech + Recognition Toolkit''. + Sanjeev Khudanpur deserves special mention for creating the conditions for the Kaldi project to succeed, first at the JHU'09 workshop where in his role as workshop organizer he was instrumental in putting the team together diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 9935fa52711..938321fd7b2 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -447,9 +447,10 @@ We now explain what these three scales do: when we add the self-loop, let the probability mass given to the self-loop be p and the mass given to the rest be (1-p). We add a self-loop with log-probability self_loop_scale * log(p), and add (self_loop_scale * log(1-p)) to all the other - log transition probabilities - out of that state. In typical topologies, the self-loop scale is the only scale - that matters. + log transition probabilities out of that state. (Note: in the initial stage of + graph creation we create a graph without self-loops, and with the non-self-loop + transition probabilities renormalized to sum to one). In typical topologies, the + self-loop scale is the only scale that matters. 
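To make the self-loop scale concrete, here is a small worked example; the values p = 0.75 and self_loop_scale = 0.1 are assumptions chosen for illustration, not figures taken from this page:
\verbatim
# p = 0.75, self_loop_scale = 0.1
# self-loop log-probability       = 0.1 * log(0.75) ~= -0.029
# added to the other transitions  = 0.1 * log(0.25) ~= -0.139
\endverbatim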
The reason we feel it might make sense to apply a different probability scale to the self-loops versus the normal transition scale is we think they could be diff --git a/src/doc/install.dox b/src/doc/install.dox index 0ffb2b1220f..b40b139a8dc 100644 --- a/src/doc/install.dox +++ b/src/doc/install.dox @@ -29,8 +29,8 @@ possibly including unfinished and experimental features, can be downloaded by typing into a shell: \verbatim - git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden - cd kaldi-trunk + git clone https://github.com/kaldi-asr/kaldi.git kaldi --origin upstream + cd kaldi \endverbatim If you want to get updates and bug fixes you can go to some checked-out directory, and type diff --git a/src/doc/mainpage.dox b/src/doc/mainpage.dox index 1665607b330..7bedc25ef13 100644 --- a/src/doc/mainpage.dox +++ b/src/doc/mainpage.dox @@ -45,6 +45,7 @@ - \subpage dependencies - \subpage legal - \subpage tutorial + - \subpage examples - \subpage glossary - \subpage data_prep - \subpage build_setup @@ -75,6 +76,7 @@ - \ref dnn1 - \ref dnn2 - \ref dnn3 + - \ref chain - \subpage online_decoding - \subpage kws - \subpage queue diff --git a/src/doc/online_decoding.dox b/src/doc/online_decoding.dox index e03d350e308..97c81dd0bcc 100644 --- a/src/doc/online_decoding.dox +++ b/src/doc/online_decoding.dox @@ -40,11 +40,11 @@ namespace kaldi { deprecated, and may eventually be removed from the trunk (but remain in ^/branches/complete). - There is some documentation for the older setup \ref online_programs "here", + There is some documentation for the older setup \ref online_programs "here", but we recommend to read this page first. \section online_decoding_scope Scope of online decoding in Kaldi - + In Kaldi we aim to provide facilities for online decoding as a library. That is, we aim to provide the functionality for online decoding but not necessarily command-line tools for it. The reason is, different @@ -59,7 +59,7 @@ namespace kaldi { neural net models (see section \ref online_decoding_nnet2). \section GMM-based online decoding - + The program online2-wav-gmm-latgen-faster.cc is currently the primary example program for the GMM-based online-decoding setup. It reads in whole wave files but internally it processes them chunk by chunk with no dependency on the future. In the example script egs/rm/s5/local/online/run_gmm.sh @@ -68,7 +68,7 @@ namespace kaldi { procedure within a typical batch-processing framework, so that you can easily evaluate word error rates. We plan to add similar programs for SGMMs and DNNs. In order to actually do online decoding, you would have to modify this program. - We should note (and this is obvious to speech recognition people but not to outsiders) + We should note (and this is obvious to speech recognition people but not to outsiders) that the audio sample rate needs to exactly match what you used in training (and oversampling won't work but subsampling will). @@ -77,11 +77,11 @@ namespace kaldi { In Kaldi, when we use the term "decoder" we don't generally mean the entire decoding program. We mean the inner decoder object, generally of the type LatticeFasterDecoder. This object takes the decoding graph (as an FST), and the decodable object - (see \ref decodable_interface). All the decoders naturally support online decoding; it + (see \ref decodable_interface). All the decoders naturally support online decoding; it is the code in the decoding program (but outside of the decoder) that needs to change. 
We should note, though, a difference in how you need to invoke the decoder for online decoding. - - In the old online-decoding setup (in online/), if "decoder" is some decoder + - In the old online-decoding setup (in online/), if "decoder" is some decoder (e.g. of type LatticeFasterDecoder) and "decodable" is a decodable object of a suitable type, you would call decoder.Decode(&decodable), and this call would block until the input was finished (because the decoder @@ -90,16 +90,16 @@ namespace kaldi { decoder.InitDecoding(), and then each time you get more feature data, you would call decoder.AdvanceDecoding(). For offline use, you can still call Decode(). - + We should mention here that in the old online setup, there is a decoder called OnlineFasterDecoder. Do not assume from the name of this that it is the only decoder to support online decoding. The special thing about the OnlineFasterDecoder is that it has the ability to work out which words are going to be "inevitably" decoded regardless of what audio data comes in in future, so you can output those words. This is useful in an online-transcription context, and if there seems to - be a demand for this, we may move that decoder from online/ into the decoder/ - directory and make it compatible with the new online setup. - + be a demand for this, we may move that decoder from online/ into the decoder/ + directory and make it compatible with the new online setup. + \section online_decoding_feature Feature extraction in online decoding @@ -113,14 +113,14 @@ namespace kaldi { (OnlineFeatureInterface::GetFrame()) and how it says how many frames are ready (OnlineFeatureInterface::NumFramesReady()), but does not say how it obtains those features. That is up to the child class. - + In online-feature.h we define classes OnlineMfcc and OnlinePlp which - are the lowest-level features. They have a member function + are the lowest-level features. They have a member function OnlineMfccOrPlp::AcceptWaveform(), which the user should call when data is captured. All the other online feature types in online-feature.h are "derived" features, so they take an object of OnlineFeatureInterface in their constructor and get their input features through a stored pointer - to that object. + to that object. The only part of the online feature extraction code in online-feature.h that is non-trivial is the cepstral mean and variance normalization (CMVN) @@ -143,7 +143,7 @@ namespace kaldi { In the Kaldi scripts, cepstral mean and variance normalization (CMVN) is generally done on a per-speaker basis. Obviously in an online-decoding context, this is impossible to do because it is "non-causal" (the current - feature depends on future features). + feature depends on future features). The basic solution we use is to do "moving-window" cepstral mean normalization. We accumulate the mean over a moving window of, by default, 6 @@ -178,7 +178,7 @@ namespace kaldi { using a method called basis-fMLLR (again, see below) where we incrementally estimate the parameters, and it is not completely invariant to offsets. - + \section online_decoding_adaptation Adaptation in online decoding The most standard adaptation method used for speech recognition is @@ -187,13 +187,13 @@ namespace kaldi { code and documentation. fMLLR consists of an affine (linear + offset) transform of the features; the number of parameters is d * (d+1), where d is the final feature dimension (typically 40). 
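To make that figure concrete (simple arithmetic, assuming the typical d = 40): the full transform has 40 * (40 + 1) = 1640 parameters, far more than can be estimated reliably from the first few seconds of audio, which is the motivation for the basis method described next.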
In the online decoding program we use - a basis method to incrementally estimate an increasing number of + a basis method to incrementally estimate an increasing number of transform parameters as we decode more data. The top-level logic for this at the decoder level is mostly implemented in class SingleUtteranceGmmDecoder. - + The fMLLR estimation is done not continuously but periodically, since it involves computing lattice posteriors and this can't very easily be done in a continuous - manner. Configuration variables in class OnlineGmmDecodingAdaptationPolicyConfig + manner. Configuration variables in class OnlineGmmDecodingAdaptationPolicyConfig determine when we re-estimate fMLLR. The default currently is, during the first utterance, to estimate it after 2 seconds, and thereafter at times in a geometrically increasing ratio with constant 1.5 (so at 2 seconds, 3 seconds, 4.5 seconds...). @@ -202,11 +202,11 @@ Note that the CMN adaptation state is frozen, as mentioned above, the first time we estimate fMLLR for a speaker, which by default will be two seconds into the - first utterance. + first utterance. \section online_decoding_models Use of multiple models in GMM-based online decoding - In the online decoding code for GMMs in online-gmm-decoding.h, up to three + In the online decoding code for GMMs in online-gmm-decoding.h, up to three models can be supplied. These are held in class OnlineGmmDecodingModels, which takes care of the logic necessary to decide which model to use for different purposes if fewer models are supplied. The three models are: @@ -215,12 +215,12 @@ - A speaker adapted model, trained with fMLLR - A discriminatively trained version of the speaker adapted model It is our practice to use a Maximum Likelihood estimated model to estimate - adaptation parameters, as this is more consistent with the Maximum Likelihood framework + adaptation parameters, as this is more consistent with the Maximum Likelihood framework than using a discriminatively trained model, although this probably makes little difference and you would lose little (and save some memory) by using the discriminatively - trained model for this purpose. + trained model for this purpose. + - \section online_decoding_nnet2 Neural net based online decoding with iVectors Our best online-decoding setup, which we recommend should be used, is the neural @@ -245,31 +245,31 @@ example setups, e.g. in egs/rm/s5, egs/wsj/s5, egs/swbd/s5b, and egs/fisher_english/s5. The top-level example script is always called local/online/run_nnet2.sh. In the case of the Resource Management recipe there is also a script local/online/run_nnet2_wsj.sh. This demonstrates - how to take a larger neural net trained on out-of-domain speech with the same sampling rate (in + how to take a larger neural net trained on out-of-domain speech with the same sampling rate (in this example, WSJ), and retrain it on in-domain data. In this way we obtained our best-ever results on RM. We are currently working on example scripts for discriminative training for this setup. \subsection online_decoding_nnet2_example Example for using already-built online-nnet2 models - + In this section we will explain how to download already-built online-nnet2 models from www.kaldi-asr.org and evaluate them on your own data.
- The reader can download the models and other relating files from - http://kaldi-asr.org/downloads/build/2/sandbox/online/egs/fisher_english/s5 , + The reader can download the models and other relating files from + http://kaldi-asr.org/downloads/build/2/sandbox/online/egs/fisher_english/s5 , which are built using the fisher_english recipe. To use the online-nnet2 models, the reader - only needs to download two directories: exp/tri5a/graph and exp/nnet2_online/nnet_a_gpu_online. Use the + only needs to download two directories: exp/tri5a/graph and exp/nnet2_online/nnet_a_gpu_online. Use the following commands to download the archives and extract them: - + \verbatim wget http://kaldi-asr.org/downloads/build/5/trunk/egs/fisher_english/s5/exp/nnet2_online/nnet_a_gpu_online/archive.tar.gz -O nnet_a_gpu_online.tar.gz wget http://kaldi-asr.org/downloads/build/2/sandbox/online/egs/fisher_english/s5/exp/tri5a/graph/archive.tar.gz -O graph.tar.gz mkdir -p nnet_a_gpu_online graph tar zxvf nnet_a_gpu_online.tar.gz -C nnet_a_gpu_online tar zxvf graph.tar.gz -C graph - \endverbatim - Here the archives are extracted to the local directory. We need to modify pathnames in the + \endverbatim + Here the archives are extracted to the local directory. We need to modify pathnames in the config files, which we can do as follows: \verbatim for x in nnet_a_gpu_online/conf/*conf; do @@ -280,7 +280,7 @@ done Next, choose a single wav file to decode. The reader can download a sample file by typing \verbatim wget http://www.signalogic.com/melp/EngSamples/Orig/ENG_M.wav - \endverbatim + \endverbatim This is a 8kHz-sampled wav file that we found online (unfortunately it is UK English, so the accuracy is not very good). It can be decoded with the following command: \verbatim @@ -296,17 +296,132 @@ done You can see the result in the logging output (although there are other ways to retrieve this). For us, the logging output was as follows: \verbatim -/home/dpovey/kaldi-online/src/online2bin/online2-wav-nnet2-latgen-faster --do-endpointing=false --online=false --config=nnet_a_gpu_online/conf/online_nnet2_decoding.conf --max-active=7000 --beam=15.0 --lattice-beam=6.0 --acoustic-scale=0.1 --word-symbol-table=graph/words.txt nnet_a_gpu_online/smbr_epoch2.mdl graph/HCLG.fst 'ark:echo utterance-id1 utterance-id1|' 'scp:echo utterance-id1 ENG_M.wav|' ark:/dev/null +/home/dpovey/kaldi-online/src/online2bin/online2-wav-nnet2-latgen-faster --do-endpointing=false --online=false --config=nnet_a_gpu_online/conf/online_nnet2_decoding.conf --max-active=7000 --beam=15.0 --lattice-beam=6.0 --acoustic-scale=0.1 --word-symbol-table=graph/words.txt nnet_a_gpu_online/smbr_epoch2.mdl graph/HCLG.fst 'ark:echo utterance-id1 utterance-id1|' 'scp:echo utterance-id1 ENG_M.wav|' ark:/dev/null LOG (online2-wav-nnet2-latgen-faster:ComputeDerivedVars():ivector-extractor.cc:180) Computing derived variables for iVector extractor LOG (online2-wav-nnet2-latgen-faster:ComputeDerivedVars():ivector-extractor.cc:201) Done. 
-utterance-id1 tons of who was on the way for races two miles and then in nineteen ninety to buy sodas sale the rate them all these to commemorate columbus is drawn into the new world five hundred years ago on the one to the moon is to promote the use of so the sales in space exploration +utterance-id1 tons of who was on the way for races two miles and then in nineteen ninety to buy sodas sale the rate them all these to commemorate columbus is drawn into the new world five hundred years ago on the one to the moon is to promote the use of so the sales in space exploration LOG (online2-wav-nnet2-latgen-faster:main():online2-wav-nnet2-latgen-faster.cc:253) Decoded utterance utterance-id1 LOG (online2-wav-nnet2-latgen-faster:Print():online-timing.cc:51) Timing stats: real-time factor for offline decoding was 1.62102 = 26.7482 seconds / 16.5009 seconds. LOG (online2-wav-nnet2-latgen-faster:main():online2-wav-nnet2-latgen-faster.cc:259) Decoded 1 utterances, 0 with errors. LOG (online2-wav-nnet2-latgen-faster:main():online2-wav-nnet2-latgen-faster.cc:261) Overall likelihood per frame was 0.230575 per frame over 1648 frames. \endverbatim +Note that for mismatched data, sometimes the iVector estimation can get confused and lead to bad results. +Something that we have found useful is to weight down the silence in the iVector estimation. +To do this you can set e.g. --ivector-silence-weighting.silence-weight=0.001; you need to set the silence +phones as appropriate, e.g. --ivector-silence-weighting.silence-phones=1:2:3:4 +(this should be a list of silence or noise phones in your phones.txt; you can experiment with +which ones to include). + +\subsection online_decoding_nnet2_lm Example for using your own language model with existing online-nnet2 models +Oftentimes users will have to use their own language model to improve the +recognition accuracy. In this section we will explain how to build a language +model with SRILM, and how to incorporate this language model into the existing +online-nnet2 models. + +We first have to build an ARPA format language model with SRILM. Note that SRILM +comes with a lot of training options, and we assume it's the user's +responsibility to figure out the best setting for their own application. +Suppose "train.txt" is our language model training corpus (e.g., training +data transcriptions), and "wordlist" is our vocabulary. Here we assume the +language model vocabulary is the same as the recognizer's vocabulary, i.e., it +only contains the words from data/lang/words.txt, except the epsilon symbol +"<eps>" and disambiguation symbol "#0". We will explain how we can use a +different vocabulary in the next section. We can build a 3-gram Kneser-Ney +language model using the following SRILM command +\verbatim +ngram-count -text train.txt -order 3 -limit-vocab -vocab wordlist -unk \ + -map-unk "<unk>" -kndiscount -interpolate -lm srilm.o3g.kn.gz +\endverbatim +Now that we have the ARPA format language model trained, we have to compile it +into WFST format. Let's first define the following variables +\verbatim +lm=srilm.o3g.kn.gz # ARPA format LM you just built. +lang=data/lang # Old lang directory provided by the online-nnet2 models +lang_own=data/lang_own # New lang directory we are going to create, which contains the new language model +lang_own_tmp=data/local/lang_own_tmp/ # Temporary directory.
+\endverbatim + +Given the above variables, we can compile an ARPA format language model into +WFST format using the following commands +\verbatim +mkdir -p $lang_own_tmp +mkdir -p $lang_own +cp -r $lang/* $lang_own +gunzip -c $lm | utils/find_arpa_oovs.pl $lang_own/words.txt \ + > $lang_own_tmp/oovs.txt || exit 1 +gunzip -c $lm | \ + grep -v '<s> <s>' | \ + grep -v '</s> <s>' | \ + grep -v '</s> </s>' | \ + arpa2fst - | fstprint | \ + utils/remove_oovs.pl $lang_own_tmp/oovs.txt | \ + utils/eps2disambig.pl | utils/s2eps.pl | \ + fstcompile --isymbols=$lang_own/words.txt --osymbols=$lang_own/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon | fstarcsort --sort_type=ilabel > $lang_own/G.fst +utils/validate_lang.pl --skip-determinization-check $lang_own || exit 1; +\endverbatim + +Now, we can compile the decoding graph with the new language model, using the +following command +\verbatim +graph_own_dir=$model_dir/graph_own +utils/mkgraph.sh $lang_own $model_dir $graph_own_dir || exit 1; +\endverbatim +where $model_dir is the model directory which contains the model "final.mdl" +and the tree "tree". At this point, we can use $graph_own_dir/HCLG.fst to +replace the old HCLG.fst, which uses the language model we just built. + +\subsection online_decoding_nnet2_vocab Example for using a different vocabulary with existing online-nnet2 models +For most applications users will also have to change the recognizer's existing +vocabulary, for example, adding out-of-vocabulary words such as person names +to the existing vocabulary. In this section we will explain how this can be +done. + +We first have to create a new pronunciation lexicon, typically by adding more +words to the recognizer's existing pronunciation lexicon. The recognizer's +lexicon that we are going to modify is usually located at $dict_dir/lexicon.txt, +where $dict_dir is the recognizer's dictionary directory, and is usually +data/local/dict. The new lexicon can be created manually by adding new lexical +entries to $dict_dir/lexicon.txt. If we do not have pronunciations for the new +words, we can use grapheme-to-phoneme (G2P) conversion to generate pronunciations +automatically. The commonly used G2P tools are Sequitur and Phonetisaurus; the +latter is usually much faster. + +The second step is to create a dictionary directory for our new lexicon, which +contains the required files, for example, lexicon.txt, lexiconp.txt, etc. +Most likely if we don't change the lexicon's phone set, the old files such as +extra_questions.txt, nonsilence_phones.txt, optional_silence.txt, +silence_phones.txt can be re-used. For details of how to create those files, we +suggest the users follow the existing Kaldi scripts, for example this one: +egs/wsj/s5/local/wsj_prepare_dict.sh. The format of the dictionary directory is +described \ref data_prep_lang_creating "here". + +Now we can create a new lang directory with the updated lexicon.
Suppose +$lang is the recognizer's old lang directory, $lang_own is the new lang +directory that we are going to create, $dict_own is the dictionary directory we +just created, and "<unk>" is the word symbol that represents +out-of-vocabulary words in the lexicon, we can generate the new lang directory +with the updated lexicon using the following command +\verbatim +utils/prepare_lang.sh \ + --phone-symbol-table $lang/phones.txt \ + $dict_own "<unk>" $lang_own_tmp $lang_own +\endverbatim +Make sure you use the option "--phone-symbol-table", which makes sure that +phones in your new lexicon will be compatible with the recognizer. + +The last step is of course to update the decoding graph, using the following +command +\verbatim +graph_own_dir=$model_dir/graph_own +utils/mkgraph.sh $lang_own $model_dir $graph_own_dir || exit 1; +\endverbatim +where $model_dir is the model directory which contains the model "final.mdl" +and the tree "tree". We can now use $graph_own_dir/HCLG.fst to replace the old +HCLG.fst. */ diff --git a/src/doc/queue.dox b/src/doc/queue.dox index cdf0cf63c40..72b2c44eab8 100644 --- a/src/doc/queue.dox +++ b/src/doc/queue.dox @@ -34,7 +34,7 @@ namespace kaldi { If you look at a top-level example script like egs/wsj/s5/run.sh, you'll see commands like \verbatim steps/train_sat.sh --cmd "$train_cmd" \ - 4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4a + 4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4a \endverbatim At the top of the run.sh script you'll see it sourcing a file called cmd.sh: \verbatim @@ -44,8 +44,8 @@ and in cmd.sh you'll see the following variable being set: \verbatim export train_cmd="queue.pl -l arch=*64" \endverbatim -You'll change this variable if you don't have GridEngine or if your queue is configured -differently from CLSP\@JHU. To run everything locally on a single machine you can +You'll change this variable if you don't have GridEngine or if your queue is configured +differently from CLSP\@JHU. To run everything locally on a single machine you can set export train_cmd=run.pl. In steps/train_sat.sh the variable cmd is set to the argument @@ -87,10 +87,10 @@ In this case, the command that actually gets executed will be something like: \verbatim echo "hello world number JOB" | head -n 1 > output.JOB \endverbatim -If you want to see what's actually getting executed, you can look in a file like +If you want to see what's actually getting executed, you can look in a file like foo.1.log, where you'll see the following: \verbatim -# echo "hello world number 1" | head -n 1 > output.1 +# echo "hello world number 1" | head -n 1 > output.1 # Started at Sat Jan 3 17:44:20 PST 2015 # # Accounting: time=0 threads=1 @@ -114,15 +114,15 @@ and what we are about to say also holds for run.pl, ssh.pl and <options> may include some or all of the following:
  - A job range specifier (e.g. JOB=1:10). The name is uppercase by convention only, and may include underscores.
    The starting index must be 1 or more; this is a GridEngine limitation.
  - Anything that looks as if it would be accepted by GridEngine as an option to qsub.
    For example, -l arch=*64*, or -l mem_free=6G,ram_free=6G, or -pe smp 6. For compatibility, scripts other than queue.pl will ignore such options.
  - New-style options like --mem 10G (see below).
-<log-file> is just a filename, which for array jobs must contain the identifier of +<log-file> is just a filename, which for array jobs must contain the identifier of the array (e.g. exp/foo/log/process_data.JOB.log). <command> can basically be anything, including symbols that would @@ -144,7 +144,7 @@ string itself contains single quotes then it uses double quotes instead. This usually does what we want. The PATH variable from the shell that you executed queue.pl from will be passed through to the scripts that get executed, and just to be certain you get everything you need, -the file ./path.sh will also be sourced. The commands will be executed +the file ./path.sh will also be sourced. The commands will be executed with bash. \subsection parallelization_common_new New-style options (unified interface) @@ -177,7 +177,7 @@ file specifies how to convert the "new-style" options into options that GridEngine or similar software can interpret. The following example shows the behavior that the default config file specifies: - +
 New-style option     Converted form (for GridEngine)   Comment
 --mem 10G            -l mem_free=10G,ram_free=10G
 --max-jobs-run 10    -tc 10                            (We use this for jobs that cause too much I/O.)
 --num-threads 6      -pe smp 6
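To connect the table above to an actual command line, here is a schematic invocation; the script name, job count and log path are made up for illustration, and --mem 10G would be converted to -l mem_free=10G,ram_free=10G as shown above:
\verbatim
# a 20-way array job; JOB is substituted into both the log file name and the command
queue.pl --mem 10G JOB=1:20 exp/foo/log/process_data.JOB.log my_script.sh --part JOB
\endverbatim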
It's also possible to add extra options with this general format, i.e. options that look like ---foo-bar and take one argument. The default configuration tabulated above works for the CLSP grid +--foo-bar and take one argument. The default configuration tabulated above works for the CLSP grid but may not work everywhere, because GridEngine is very configurable. Thefore you may have to create a config file conf/queue.conf and edit it to work with your grid. The following configuration file is the one that queue.pl defaults to if conf/queue.conf @@ -209,10 +209,10 @@ The line beginning with command specifies the unchanging part of the command line, and you can modify this to get it to use grid software other than GridEngine, or to specify options that you always want. The lines beginning with -option specify how to transform the input options such as --mem. +option specify how to transform the input options such as --mem. Lines beginning with something like "option mem=*" handle the general case (the $0 gets replaced with the actual argument to the option), while -lines like "option gpu=0" allow you to specify special behavior for special +lines like "option gpu=0" allow you to specify special behavior for special cases of the argument, so in this case the option --gpu 0 is configured to produce no extra options to qsub at all. The line "default gpu=0" specifies that if you don't give the --gpu option at all, queue.pl should act like @@ -224,10 +224,10 @@ configured with a line: "option gpu=0 -q all.q", so there was a time wh The mapping from what the config-file specifies to what appears on the command-line of qsub sometimes has to be tweaked slightly in the perl code: for instance, we made it -so that the --max-jobs-run option is ignored for non-array jobs. +so that the --max-jobs-run option is ignored for non-array jobs. + + \subsection parallelization_common_new_example Example of configuring grid software with new-style options - \subsection parallelization_common_new_example Example of configuring grid - software with new-style options We'd like to give an example of how the config file can be used in a real situation. We had a problem where, due to a bug in an outdated version of the @@ -270,10 +270,10 @@ parallelization scripts. line. The scripts that we ask qsub to run also make use of the variable $SGE_TASK_ID, which SGE sets to the job index for array jobs. Our plan is to extend the config-file mechanism as necessary to accommodate whatever changes are needed to support - other grid software, within reason. + other grid software, within reason. Since we have explained the behavior of queue.pl at length above, we aren't going - to provide many further details in this section, but please see below the section + to provide many further details in this section, but please see below the section \ref parallelization_gridengine. \subsection parallelization_specific_run Parallelization using run.pl @@ -300,12 +300,12 @@ parallelization scripts. ssh.pl is a poor man's queue.pl, for use in case you have a small cluster of several machines but don't want the trouble of setting - up GridEngine. Like run.pl, it doesn't attempt to keep track of + up GridEngine. Like run.pl, it doesn't attempt to keep track of CPUs or memory; it works like run.pl except that it distributes the - jobs across multiple machines. + jobs across multiple machines. 
You have to create a file .queue/machines (where .queue is a subdirectory of the directory you are running the script from), - where each line contains the name of a machine. It needs to be possible to ssh to each + where each line contains the name of a machine. It needs to be possible to ssh to each of these machines without a password, i.e. you have to set up your ssh keys. @@ -355,7 +355,7 @@ parallelization scripts. To install GridEngine on the master, you'll run (on your chosen master node): \verbatim sudo apt-get install gridengine-master gridengine-client -\endverbatim +\endverbatim Select "yes" for automatic configuration. It will ask you for the "cell name", which you can leave as "default", and it will ask for the name of the "master", which you should set to the hostname of @@ -366,7 +366,7 @@ parallelization scripts. sometimes be traced to this. Also be aware that doing "apt-get remove" of these packages and reinstalling them won't give you a blank slate because Debian sometimes remembers your selections; this can be a pain. - + It will make your life easier if you add yourself as manager, so do: \verbatim sudo qconf -am @@ -377,9 +377,9 @@ parallelization scripts. To install GridEngine on the normal nodes, you'll run \verbatim sudo apt-get install gridengine-client gridengine-exec -\endverbatim +\endverbatim The "cell name" should be left as "default", and the "master" should be the name of - the master node that you previously installed. + the master node that you previously installed. You can run this on the master too if the master is to run jobs also. Typing qstat and qhost -q will let you know whether things are working. @@ -399,13 +399,13 @@ instance-1.c.analytical-rig-638.internal lx26-amd64 1 0.07 3.6G 133.9M doesn't like it when these things are inconsistent. If you need to change the name of the master from what you told the installer, you may be able to do so by editing the file \verbatim -/var/lib/gridengine/default/common/act_qmaster +/var/lib/gridengine/default/common/act_qmaster \endverbatim (at least, this is where it's located in Debian Wheezy). \subsection parallelization_gridengine_configuring Configuring GridEngine - First let's make sure that a queue is defined. GridEngine doesn't define any queues by + First let's make sure that a queue is defined. GridEngine doesn't define any queues by default. We'll set up a queue called all.q. Make sure the shell variable EDITOR is set to your favorite shell (e.g. vim or emacs), and type as follows; and this should work from master or client. @@ -438,9 +438,9 @@ change root to an email address where you want to receive notifications if things go wrong. Be advised that due to anti-spam measures, sending emails from the cloud is painful from EC2 and close to impossible from Google's cloud offering, so it may be best just to leave this field the -way it is and make do without email notifications. You could also edit the file so that it says +way it is and make do without email notifications. You could also edit the file so that it says \verbatim - flush_time=00:00:10 + flush_time=00:00:10 \endverbatim (the default is 00:00:15), which will give a slightly faster turnaround time for submitting jobs. @@ -449,23 +449,23 @@ your jobs, and these can be viewed using qconf -sc. Modify them using qconf -mc. 
Modify the mem_free line to change the default memory requirement from 0 to 1G, i.e.: \verbatim -#name shortcut type relop requestable consumable default urgency +#name shortcut type relop requestable consumable default urgency #------------------------------------------------------------------------------------------ - + mem_free mf MEMORY <= YES NO 1G 0 \endverbatim and also add the following two new lines; it doesn't matter where in the file you add them. \verbatim -#name shortcut type relop requestable consumable default urgency +#name shortcut type relop requestable consumable default urgency #------------------------------------------------------------------------------------------ - + gpu g INT <= YES YES 0 10000 ram_free ram_free MEMORY <= YES JOB 1G 0 \endverbatim You'll only need the "gpu" field if you add GPUs to your grid; the ram_free is a field that we find useful in managing the memory of the machines, as the inbuilt field mem_free doesn't seem to work quite right for our purposes. Later on - when we add hosts to the grid, we'll use the command qconf -me to + when we add hosts to the grid, we'll use the command qconf -me to edit the complex_values field to read something like: \verbatim complex_values ram_free=112G,gpu=2 @@ -474,7 +474,7 @@ ram_free ram_free MEMORY <= YES JOB 1G a job that needs 10G of memory, we'll specify -l mem_free=10G,ram_free=10G as an option to qsub; the mem_free requirement makes sure the machine has that much free memory at the time the job starts, and the ram_free requirement makes sure we - don't submit a lot of jobs requiring a lot of memory, all to the same host. + don't submit a lot of jobs requiring a lot of memory, all to the same host. We tried, as an alternative to adding the ram_free resource, using qconf -mc to edit the consumable field of the inbuilt mem_free resource to say YES, to make GridEngine keep track of memory requests; but this did not @@ -512,7 +512,7 @@ pe_list make smp \verbatim prolog /var/lib/gridengine/default/common/prolog.sh \endverbatim - (the default was NONE), + (the default was NONE), and the script /var/lib/gridengine/default/common/prolog.sh, which we copied to that location on each individual node in the cluster, reads as follows. Its only purpose is to wait a short time if the job script can't be @@ -531,7 +531,7 @@ function test_ok { if [ ! -z "$SGE_STDERR_PATH" ]; then if [ ! -d "`dirname $SGE_STDERR_PATH`" ]; then echo "$0: warning: no such directory $JOB_SCRIPT, will wait." 1>&2 - return 1; + return 1; fi fi return 0; @@ -558,7 +558,7 @@ We also edited the queue with qconf -mq all.q to change rerun TRUE \endverbatim This means that when jobs fail, they get in a status that shows up in the output of -qstat as Eqw, with the E indicating error, and you can ask the +qstat as Eqw, with the E indicating error, and you can ask the queue to reschedule them by clearing the error status with qmod -cj (or if you don't want to rerun them, you can delete them with qmod -dj ). Setting the queue to allow reruns can avoid the hassle of rerunning scripts from the @@ -582,8 +582,8 @@ rlogin_daemon builtin rsh_command builtin rsh_daemon builtin \endverbatim -This was to solve a problem whose nature we can no longer recall, but it's something you might want to try it if -commands like qlogin and qrsh don't work. +This was to solve a problem whose nature we can no longer recall, but it's something you might want to try it if +commands like qlogin and qrsh don't work. 
\subsection parallelization_gridengine_configuring_adding Configuring GridEngine (adding nodes) @@ -591,7 +591,7 @@ commands like qlogin and qrsh don't work. As mentioned above, you can install GridEngine on nodes by doing \verbatim sudo apt-get install gridengine-client gridengine-exec -\endverbatim +\endverbatim and you need to specify default as the cluster name, and the name of your master node as the master (probably using the FQDN of the master is safest here, but if you are on a local network, just the last part of the name may also work). @@ -613,7 +613,7 @@ commands like qlogin and qrsh don't work. \verbatim complex_values ram_free=112G,gpu=1 \endverbatim -You'll notice is a slight asymmetry between the commands qconf -sh +You'll notice is a slight asymmetry between the commands qconf -sh and qconf -ss on the one hand, and qconf -sel on the other. The "l" in the latter command means show the list. The difference is that administrative and submit host lists are just lists of hosts, whereas @@ -623,7 +623,7 @@ You can view the information about a particular host with qconf -se qconf -ae , and modify with qconf -me . This is a general pattern in GridEngine: for things like queues that have a bunch of information in them, you can show the full list -by typing a command ending in "l" like qconf -sql, and the corresponding "add" +by typing a command ending in "l" like qconf -sql, and the corresponding "add" ("a") and "modify" ("m") commands accept arguments. It's not enough to tell GridEngine that a node is an execution host; you have to also add it to the queue, @@ -647,7 +647,7 @@ nodes with that number of slots you can save yourself some time and avoid adding name to the slots field. There is an alternative way to set up the hostlist field. GridEngine has the concept of host groups, so you could do qconf -ahgrp \@allhosts to add a group of hosts, and edit it using -qconf -mhgrp \@allhosts to add your new nodes. The configuration of +qconf -mhgrp \@allhosts to add your new nodes. The configuration of all.q could then just read: \verbatim hostlist @allhosts @@ -663,12 +663,12 @@ HOSTNAME ARCH NCPU LOAD MEMTOT MEMUSE SWAPTO SWAPUS ------------------------------------------------------------------------------- global - - - - - - - a01.clsp.jhu.edu lx26-amd64 24 12.46 126.2G 11.3G 86.6G 213.7M - all.q BIP 0/6/20 + all.q BIP 0/6/20 a02.clsp.jhu.edu lx26-amd64 24 16.84 126.2G 12.4G 51.3G 164.5M - all.q BIP 0/18/20 + all.q BIP 0/18/20 \endverbatim -If you see the letter "E" in the place where the example above shows "BIP", +If you see the letter "E" in the place where the example above shows "BIP", it means the node is in the error state. Other letters you don't want to see in that position are "a" for alarm (a generic indicator of badness) and "u" for unreachable. "d" means a node has been disabled by an administrator. @@ -694,8 +694,8 @@ You can view all jobs from all users by running \verbatim qstat -u '*' \endverbatim - -\section parallelization_grid_stable Keeping your grid stable + +\section parallelization_grid_stable Keeping your grid stable In this section we have some general notes on how to ensure stability in a compute cluster of the kind useful for Kaldi. @@ -754,7 +754,7 @@ We show it as if we're grepping it from /etc/fstab; this isn't actually how we d # grep a05 /etc/fstab a05:/mnt/data /export/a05 nfs rw,vers=3,rsize=8192,wsize=8192,acdirmin=5,acdirmax=8,hard,proto=tcp 0 0 \endverbatim -The option "vers=3" means we use NFS version 3, which is stateless. 
We tried using version 4, +The option "vers=3" means we use NFS version 3, which is stateless. We tried using version 4, a supposedly more advanced "stateful" protocol, but we got a lot of crashes. The acdirmin=5 and acdirmin=8 options are the minimum and maximum times that NFS @@ -762,7 +762,7 @@ waits before re-reading cached directory information; the defaults are 30 and 60 This is important for Kaldi scripts, because the files that we execute on GridEngine are written only shortly before we run the scripts, so with default NFS options they may not yet be visible on the execution host at the time they are needed. Above we showed our script /var/lib/gridengine/default/common/prolog.sh -which waits up to 14 seconds for the script to appear. It's significant that 14 > 8, i.e. that the +which waits up to 14 seconds for the script to appear. It's significant that 14 > 8, i.e. that the number of seconds the prolog script will wait for is greater than the maximum directory caching period for NFS. The hard option is also important; it means that if the server is busy, the client will wait @@ -813,7 +813,7 @@ and manage a compute grid. In CLSP we use a lot of NFS hosts, not just one or two; in fact, most of our nodes also export data via NFS. If you do this you should use our mem-killer.pl or a similar script, or you will get instability due -to memory exhaustion when users make mistakes. +to memory exhaustion when users make mistakes. Having a large number of file servers is a particularly good idea for queues that are shared by many people, because it's inevitable that people will overload file servers, and if there are diff --git a/src/doc/tree_externals.dox b/src/doc/tree_externals.dox index ee2bc11d8b9..df9f96e8430 100644 --- a/src/doc/tree_externals.dox +++ b/src/doc/tree_externals.dox @@ -32,13 +32,13 @@ namespace kaldi { The basic algorithm that is being implemented is a top-down greedy splitting, where we have a number of ways we can split the data by asking about, say, the left phone, the right - phone, the central phone, the state we're in, and so on. + phone, the central phone, the state we're in, and so on. The algorithm we implement is similar to the standard algorithm, see for example the paper "Tree-based State Tying for High Accuracy Acoustic Modeling" by Young, Odell and Woodland. In this algorithm, we split the data up by asking the locally optimal question, i.e. the one that gives the most likelihood increase, supposing - we model the data on each side of the split by a single Gaussian. - Differences from standard implementations include added flexibility + we model the data on each side of the split by a single Gaussian. + Differences from standard implementations include added flexibility about how to configure the tree roots; the ability to ask questions about the HMM-state and the central phone; and the fact that by default in the Kaldi scripts, the questions are automatically generated by a top-down binary clustering of the data, which means @@ -50,7 +50,7 @@ namespace kaldi { be the tree roots. For how to configure it using the standard scripts, see \ref data_prep. In practice we generally let each tree-root correspond to a "real phone", meaning that we group together all word-position-dependent, tone-dependent or stress-dependent versions of - each phone into one group that becomes a tree root. + each phone into one group that becomes a tree root. The rest of this page mostly gives details at the code level of what is happening. 
@@ -74,7 +74,7 @@ below summarizes these values: N is the width of the context window and P is the identity of the designated -"central phone". Normally P is exactly in the middle of the window +"central phone". Normally P is exactly in the middle of the window (hence the name "central-position"); for example, with N=3, we would normally have P=1, but you are free to choose any value from 0 to N-1; for instance, P=2 and N=3 means two phones of left context and no right context at all. @@ -82,32 +82,32 @@ In the code, when we talk about the "central phone" we always mean the P'th phone which may or may not actually be the central phone of the context window. A vector of integers representing a typical triphone context window might be: -\code -// probably not valid C++ +\code +// probably not valid C++ vector<int32> ctx_window = { 12, 15, 21 }; \endcode -Assuming N=3 and P=1, this would represent phone 15 with +Assuming N=3 and P=1, this would represent phone 15 with a right context of 21 and a left context of 12. The way we handle end effects is using zero (which is not a valid phone because it's reserved in OpenFst for the epsilon meaning "no symbol"), so for instance: -\code +\code vector<int32> ctx_window = { 12, 15, 0 }; \endcode means phone 15 with a left-context of 12 and no right-context because it's the end of the utterance. At the end of utterance in particular, the use of zero this way may be a little unexpected because the last "phone" is actually the -subsequential symbol "$" (see \ref graph_c), but for the convenience +subsequential symbol "$" (see \ref graph_c), but for the convenience of the decision-tree code we don't put the subsequential symbol in these context windows, we put zero. Note that if we had N=3 and P=2, the above context window would be invalid because its P'th element would be zero which is not a real phone; also of course, -if we had a tree with N=1, neither of the windows above would be valid because they +if we had a tree with N=1, neither of the windows above would be valid because they are the wrong size. In the monophone case, we would have a window like: -\code +\code vector<int32> ctx_window = { 15 }; \endcode so monophone systems are just treated as a special case of context-dependent -systems, with a window size N of 1 and a tree that doesn't do anything very +systems, with a window size N of 1 and a tree that doesn't do anything very interesting. @@ -126,28 +126,28 @@ TransitionModel object and an AmDiagGmm object). If the program gmm-init-mono receives an option called --shared-phones, it will share the pdfs between specified sets of phones; otherwise it makes all the phones separate. -After training a monophone system starting from a flat start, we take +After training a monophone system starting from a flat start, we take the monophone alignments -and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc +and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc "acc-tree-stats") to accumulate statistics for training the tree. This program is not limited to reading in monophone alignments; it works from context-dependent alignments too so we can build trees based on e.g. triphone alignments. -The statistics for tree building are written to disk as the type \ref BuildTreeStatsType -(see \ref treei_stats). +The statistics for tree building are written to disk as the type \ref BuildTreeStatsType +(see \ref treei_stats).
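As a concrete illustration of this step, a schematic invocation might look as follows; the paths and the silence-phone id are made up, and the real training scripts (e.g. steps/train_deltas.sh) pipe in transformed features rather than reading feats.scp directly:
\verbatim
acc-tree-stats --ci-phones=1 exp/mono_ali/final.mdl \
  scp:data/train/feats.scp "ark:gunzip -c exp/mono_ali/ali.1.gz|" exp/tri1/1.treeacc
\endverbatim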
The function AccumulateTreeStats() takes the values N and P, as explained in the previous section; the command-line programs will set these by default to 3 and 1 respectively, but this can be overridden using the --context-width -and --central-position options. The program \ref acc-tree-stats.cc +and --central-position options. The program \ref acc-tree-stats.cc "acc-tree-stats" takes a list of context-independent phones (e.g. silence), but this is not required even if there are context-independent phones; it is just -a mechanism to reduce the size of the statistics. +a mechanism to reduce the size of the statistics. For context-independent phones, the program will accumulate the corresponding statistics without the keys corresponding to the left and right phones defined (c.f. \ref treei_event_map). When the statistics have been -accumulated we use the program \ref build-tree.cc "build-tree" to -build the tree. This outputs the tree. +accumulated we use the program \ref build-tree.cc "build-tree" to +build the tree. This outputs the tree. The program \ref build-tree.cc "build-tree" requires three things: - The statistics (of type BuildTreeStatsType) - The questions config (of type Questions) @@ -160,21 +160,32 @@ scripts, these are automatically obtained from tree-building statistics by the program cluster-phones. The roots file specifies sets of phones that are going to have shared roots in the decision-tree clustering process, and says for each phone set the following two things: - - "shared" or "not-shared" says whether or not there should be separate - roots for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, - in the typical case), or if the roots - should be shared. If we are going to be splitting (the "split" option - below) we enforce that the roots should be shared. + + - "shared" or "not-shared" says whether or not there should be separate roots + for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, in the + typical case), or if the roots should be shared. If it says "shared" there + will be a single tree-root for all HMM states (e.g. all three states, in a + normal topology); if "not-shared" there would be (e.g.) three tree-roots, + one for each pdf-class. + - "split" or "not-split" says whether or not the decision tree splitting should actually be done for the roots in question (for silence, we - typically don't split). + typically don't split). If the line says "split" (the normal case) then + we do the decision tree splitting. If it says "not-split" then no splitting + is done and the roots are left un-split. -Be careful because the notation is a bit tricky. The "shared" on the line of -the roots file is about whether we will share all the 3 HMM-states of the phone -in a single tree root. But we will always share together the roots of all the phones that -appear on a single lines of the roots file. This is not configurable via these -strings because if you don't want to share them, you can just put them on -separate lines of the roots file. + +The following will clarify some aspects of how this works: + + - If we say "shared split", then + even though there is one root node for all three HMM-states, the different + HMM states can still get different leaves because the tree can ask questions + about the pdf-class as well as about phonetic context. + + - We always share together the roots of all the phones that appear on a single + line of the roots file.
This is not configurable via these strings because + if you don't want to share the phones' roots, you can just put them on + separate lines of the roots file. Below is an example of a roots file; this assumes that phone 1 is silence and all the other phones have separate roots. @@ -185,14 +196,14 @@ shared split 3 ... shared split 28 \endverbatim -Having multiple phones on the same line is most useful when we have things like position and +Having multiple phones on the same line is most useful when we have things like position and stress-dependent phones; in this case each "real" phone would correspond to a set of integer phone ids. In that case we share the roots for all versions of a particular underlying phone. Below is an example of a roots file -for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; +for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; it would have to be converted to integer form before being read by Kalid): \verbatim -not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S +not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S shared split AA_B AA_E AA_I AA_S AA0_B AA0_E AA0_I AA0_S AA1_B AA1_E AA1_I AA1_S AA2_B AA2_E AA2_I AA2_S shared split AE_B AE_E AE_I AE_S AE0_B AE0_E AE0_I AE0_S AE1_B AE1_E AE1_I AE1_S AE2_B AE2_E AE2_I AE2_S shared split AH_B AH_E AH_I AH_S AH0_B AH0_E AH0_I AH0_S AH1_B AH1_E AH1_I AH1_S AH2_B AH2_E AH2_I AH2_S @@ -207,7 +218,7 @@ When creating the roots file, you should ensure that at least one phone on each For instance, in this case, if the phone AY was seen in at least some combination of stress and word-position, we would be OK. -In this example, we have various word-position-dependent variants of silence and so on. +In this example, we have various word-position-dependent variants of silence and so on. In this example they will all share their pdf's because they are on the same line and are "not-split"-- but they may have different transition parameters. In fact, most of these variants of silence would never be used as silence never appears inside words; this is for @@ -224,13 +235,13 @@ tree to another using the program \ref convert-ali.cc "convert-ali". pdf-id, and these are contiguous (typically there are several thousand of these in an LVCSR system). They are originally assigned when the tree is first built. Depending how the tree is built, it may or may not be possible to say, for each pdf-id, which phone - it corresponds to. + it corresponds to. \section tree_ctxdep Context dependency objects The ContextDependencyInterface object is a virtual base-class for the tree that specifies how it interacts with the graph-building code. This - interface contains only four functions: + interface contains only four functions: - \ref ContextDependencyInterface::ContextWidth() "ContextWidth()" returns the value of N (context-width) that the tree requires. 
- \ref ContextDependencyInterface::CentralPosition() "CentralPosition()" returns @@ -264,8 +275,8 @@ else \endcode The only class that currently inherits from ContextDependencyInterface -is the class ContextDependency, which has marginally richer interface; -the only important addition is the function \ref ContextDependency::GetPdfInfo +is the class ContextDependency, which has marginally richer interface; +the only important addition is the function \ref ContextDependency::GetPdfInfo "GetPdfInfo" which is used by the TransitionModel class to work out which phones a particular pdf can possibly correspond to (this function could be emulated given only the interface of ContextDependencyInterface, by @@ -274,7 +285,7 @@ enumerating all contexts). The ContextDependency object is actually a fairly thin wrapper for the EventMap object; see \ref tree_internals. We wanted to hide the actual implementation of the tree as much as possible to make it -easy to refactor the code later if needed. +easy to refactor the code later if needed. \section tree_example An example of a decision tree @@ -309,18 +320,18 @@ Below is a kind of quasi-BNF notation that explains the tree-file format. In the example below, the top-level EventMap of the tree is a SplitEventMap (SE) that splits on key 1, which is the central phone. In square brackets are a contiguous range of phone-ids. As it happens, these don't represent a question, but are just a way of -splitting on phones so we can get to the "real" decision trees which are per phone. +splitting on phones so we can get to the "real" decision trees which are per phone. The issue is that this tree was built with "shared roots", so there are various phone-ids, corresponding to different word-position-and-stress-marked versions of the same phone, that share the root. We can't use a TableEventMap (TE) at the top level of the tree, or we'd have to repeat each decision tree several times (since the EventMap is a pure -tree, not a general graph, it has no mechanism for pointers to be "shared"). -The next few instances of the "SE" label are also part of this "quasi-tree" which +tree, not a general graph, it has no mechanism for pointers to be "shared"). +The next few instances of the "SE" label are also part of this "quasi-tree" which is initially splitting on the central phone (as we go down this file we are going deeper into the tree; notice that the braces "{" are opening but not yet closing). Then we have the string "TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )", which represents splitting with a TableEventMap -on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. +on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. The values represent the five pdf-ids for the silence and noise phones SIL, NSN and SPN; in our setup, the pdfs are shared between these three non-speech phones (only the transition matrix is specific to each non-speech phone). @@ -332,8 +343,8 @@ various versions of the phone AA; and question is asking whether the pdf-class ( has value 0 (i.e. the leftmost HMM-state). Assuming the answer is "yes", the next question is "SE 2 [ 220 221 222 223 ]", which is asking whether the phone to the right is one of various forms of the phone "M" (a rather unintuitive question to ask, since we're -in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 
286 287 ]" which is -a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if +in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is +a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if no, 696 ("CE 696"). \verbatim s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100 @@ -366,8 +377,8 @@ SE 2 [ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 36 37 38 39 40 41 42 43 44 45 4 \endverbatim Below is a simpler example: the monophone tree from the Resource Management -recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). -The key "0" is the phone-position of zero which represents the central (and only) phone +recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). +The key "0" is the phone-position of zero which represents the central (and only) phone since the context width (N) is 1. The number of entries in the table is 49 (in this case, the number of phones plus one). The first EventMap in the table (index zero) is NULL, because there is no phone with @@ -375,11 +386,11 @@ index zero. The next one is a TableEventMap with three elements, corresponding to the three HMM-states (technically, pdf-classes) of the first phone: "TE -1 3 ( CE 0 CE 1 CE 2 )". \verbatim s3# copy-tree --binary=false exp/mono/tree - 2>/dev/null| head -5 -ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) -TE -1 3 ( CE 3 CE 4 CE 5 ) -TE -1 3 ( CE 6 CE 7 CE 8 ) -TE -1 3 ( CE 9 CE 10 CE 11 ) -TE -1 3 ( CE 12 CE 13 CE 14 ) +ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) +TE -1 3 ( CE 3 CE 4 CE 5 ) +TE -1 3 ( CE 6 CE 7 CE 8 ) +TE -1 3 ( CE 9 CE 10 CE 11 ) +TE -1 3 ( CE 12 CE 13 CE 14 ) \endverbatim @@ -391,8 +402,8 @@ disambiguation symbols and possibly epsilon symbols). In the graph, as always, these are represented by integer labels. We use an object that, in code and in filenames, is generally called ilabel_info. The ilabel_info object 4has a strong connection to the \ref fst::ContextFst "ContextFst" objects, see \ref graph_context. -As with many other Kaldi types, ilabel_info is a generic (STL) type but -we use a consistent variable name +As with many other Kaldi types, ilabel_info is a generic (STL) type but +we use a consistent variable name to make it identifiable. It is of the following type: \code std::vector > ilabel_info; @@ -402,7 +413,7 @@ input label the corresponding phonetic context window (see above, \ref tree_window). For example, suppose symbol 1500 is phone 30 with a right-context of 12 and a left-context of 4, we would have -\code +\code // not valid C++ ilabel_info[1500] == { 4, 30, 12 }; \endcode @@ -410,14 +421,14 @@ In the monophone case, we would have things like: \code ilabel_info[30] == { 28 }; \endcode -There are special cases to deal with disambiguation symbols (see -\ref graph_disambig or the +There are special cases to deal with disambiguation symbols (see +\ref graph_disambig or the Springer Handbook paper referenced above for an explanation of what these are). 
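Before describing those special cases, here is a minimal sketch of how a plain ilabel_info entry might be unpacked in a standard triphone setup (N = 3, P = 1). The helper name UnpackTriphoneEntry is hypothetical and is not part of Kaldi; it only illustrates the layout described above.
\code
#include <vector>
#include "base/kaldi-common.h"  // for kaldi::int32 and KALDI_ASSERT.

// Hypothetical helper: unpack one plain (non-special) entry of an ilabel_info
// object for a triphone system (context width N = 3, central position P = 1).
void UnpackTriphoneEntry(const std::vector<std::vector<kaldi::int32> > &ilabel_info,
                         kaldi::int32 ilabel,
                         kaldi::int32 *left, kaldi::int32 *central, kaldi::int32 *right) {
  const std::vector<kaldi::int32> &window = ilabel_info[ilabel];
  KALDI_ASSERT(window.size() == 3);  // plain entry only; special cases are described below.
  *left = window[0];     // left-context phone.
  *central = window[1];  // the central phone that this input label stands for.
  *right = window[2];    // right-context phone.
}
\endcode
For the example above, calling this hypothetical helper with ilabel 1500 would give left = 4, central = 30 and right = 12.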
If an ilabel_info entry corresponds to a disambiguation symbol, we put in it the negative of the symbol-table entry of the disambiguation symbol (note that this is not the same as the number of the printed form -of the disambiguation symbol as in #0, #1, #2 etc., it is the number -corresponding to it in a symbol-table file, which in our current scripts is +of the disambiguation symbol as in #0, #1, #2 etc., it is the number +corresponding to it in a symbol-table file, which in our current scripts is called phones_disambig.txt). For example, \code ilabel_info[5] == { -42 }; @@ -428,7 +439,7 @@ so the programs that interpret the ilabel_info object don't need to be given a list of disambiguation symbols in order to be able to distinguish them from real phones in the monophone case. There are two additional special cases: we have -\code +\code ilabel_info[0] == { }; // epsilon ilabel_info[1] == { 0 }; // disambig symbol #-1; // we use symbol 1, but don't consider this hardwired. diff --git a/src/doc/tutorial.dox b/src/doc/tutorial.dox index ea94ee93e50..2e47624abeb 100644 --- a/src/doc/tutorial.dox +++ b/src/doc/tutorial.dox @@ -23,7 +23,7 @@ - \subpage tutorial_prereqs "Prerequisites" - \subpage tutorial_setup "Getting started" (15 minutes) - \subpage tutorial_git "Version control with Git" (5 minutes) - - \subpage tutorial_looking "Overview of the distribution" (25 minutes) + - \subpage tutorial_looking "Overview of the distribution" (20 minutes) - \subpage tutorial_running "Running the example scripts" (40 minutes) - \subpage tutorial_code "Reading and modifying the code" (30 minutes) diff --git a/src/doc/tutorial_git.dox b/src/doc/tutorial_git.dox index 63676df86c1..7612a1b1e4a 100644 --- a/src/doc/tutorial_git.dox +++ b/src/doc/tutorial_git.dox @@ -252,6 +252,15 @@ GitHub will automatically update the pull request web page. Then reply e. g. "Done" under the comments that you received, so that they know you followed up on their comments. +If you are creating a pull request only for a review of an incomplete piece of +work, which makes sense and is encouraged if you want early feedback on a +proposed feature, begin the title of your pull request with the prefix +WIP:. This will tell the maintainers not to merge the pull request +yet. When you push more commits to your branch, they automatically show in the +pull request. When you think the work is complete, edit the pull request title +to remove the \c WIP prefix and then add a comment to this effect, so that the +maintainers are notified. + \ref tutorial "Up: Kaldi tutorial"
\ref tutorial_setup "Previous: Getting started"
\ref tutorial_looking "Next: Overview of the distribution"
diff --git a/src/doc/tutorial_looking.dox b/src/doc/tutorial_looking.dox index 6d525df93e9..420abfc9bce 100644 --- a/src/doc/tutorial_looking.dox +++ b/src/doc/tutorial_looking.dox @@ -35,11 +35,8 @@ The directory "tools/' is where we install things that Kaldi depends on in various ways. Change directory to tools/ and list it. You will see various - files and subdirectories, mostly things that have been installed by the script - install.sh. Look very quickly at the files install.sh and INSTALL. These files - contain similar material since they cover the same steps, but INSTALL is the - manual version of the instructions and install.sh is the automatic version. The - manual version may be helpful as a fall-back plan in case you have installation problems. + files and subdirectories, mostly things that have been installed by the make command. + Look very quickly at the file INSTALL. This file gives instructions on how to install the tools. The most important subdirectory is the one for OpenFst. cd to openfst/. This is a soft link to the actual directory which has a version number. List the openfst directory. @@ -142,16 +139,16 @@ include ../kaldi.mk Look at the file ../kaldi.mk. It will contain some rules related to valgrind (for memory debugging), and then some system-specific configuration in the form of variables such as CXXFLAGS. - See if there are any -O options (e.g. -O0). You might want to remove the flags - -O0 and -DKALDI_PARANOID before running big experiments, as they slow things - down (we enable them by default for better debugging). + See if there are any -O options (e.g. -O0). The flags + -O0 and -DKALDI_PARANOID are disabled by default as they slow things + down (you might want to enable them for better debugging). Look again at base/Makefile. The statement "all:" at the top tells Make that "all" is the top-level target (because there are targets in kaldi.mk and we don't want these to become the top-level target). Because the dependencies of "all" depend on variables defined later, we have another - statement down below in which we define what "all" depends on. Look for - it. Several other targets are defined, starting with "clean". Look for - them in the Makefile. To make "clean" you would type "make clean". + statement (the target is defined in default_rules.mk) in which we define what "all" depends on. + Look for it. Several other targets are defined, starting with "clean". + Look for them. To make "clean" you would type "make clean". The target .valgrind is not something you would invoke from the command line; you would type "make valgrind" (the target is defined in kaldi.mk). Invoke all of these targets, i.e. type "make clean" and the same for the others, diff --git a/src/doc/tutorial_running.dox b/src/doc/tutorial_running.dox index a9f782b9fc2..1f3cb4ee82a 100644 --- a/src/doc/tutorial_running.dox +++ b/src/doc/tutorial_running.dox @@ -148,7 +148,7 @@ Look at the files with suffix .csl (in data/lang/phones). These are colon-separa Look at phones.txt (in data/lang/). This file is a phone symbol table that also handles the "disambiguation symbols" used in the standard FST recipe. These symbols are conventionally called \#1, \#2 and so on; - see the paper "Speech Recognition + see the paper "Speech Recognition with Weighted Finite State Transducers" . We also add a symbol \#0 which replaces epsilon transitions in the language model; see \ref graph_disambig for more information. 
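For a concrete picture, the end of a typical phones.txt looks something like the following; the symbols and integer ids shown here are only illustrative, and the exact values will differ in your setup (the disambiguation symbols are listed after the regular phones):
\verbatim
...
zh 41
#0 42
#1 43
#2 44
\endverbatim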
How many disambiguation symbols diff --git a/src/feat/Makefile b/src/feat/Makefile index 8b8fa5145ad..858ed714be3 100644 --- a/src/feat/Makefile +++ b/src/feat/Makefile @@ -6,16 +6,18 @@ include ../kaldi.mk TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \ feature-functions-test pitch-functions-test feature-sdc-test \ - resample-test online-feature-test sinusoid-detection-test + resample-test online-feature-test sinusoid-detection-test \ + signal-test OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \ feature-spectrogram.o mel-computations.o wave-reader.o \ - pitch-functions.o resample.o online-feature.o sinusoid-detection.o + pitch-functions.o resample.o online-feature.o sinusoid-detection.o \ + signal.o feature-window.o LIBNAME = kaldi-feat ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../thread/kaldi-thread.a + ../util/kaldi-util.a ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/feat/feature-common-inl.h b/src/feat/feature-common-inl.h new file mode 100644 index 00000000000..a9c3c47ebbc --- /dev/null +++ b/src/feat/feature-common-inl.h @@ -0,0 +1,74 @@ +// feat/feature-common-inl.h + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_COMMON_INL_H_ +#define KALDI_FEAT_FEATURE_COMMON_INL_H_ + +// Do not include this file directly. It is included by feat/feature-common.h + +namespace kaldi { + +template +void OfflineFeatureTpl::Compute( + const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output, + Vector *deprecated_wave_remainder) { + KALDI_ASSERT(output != NULL); + int32 rows_out = NumFrames(wave.Dim(), computer_.GetFrameOptions()), + cols_out = computer_.Dim(); + if (rows_out == 0) { + output->Resize(0, 0); + if (deprecated_wave_remainder != NULL) + *deprecated_wave_remainder = wave; + return; + } + output->Resize(rows_out, cols_out); + if (deprecated_wave_remainder != NULL) + ExtractWaveformRemainder(wave, computer_.GetFrameOptions(), + deprecated_wave_remainder); + Vector window; // windowed waveform. + bool use_raw_log_energy = computer_.NeedRawLogEnergy(); + for (int32 r = 0; r < rows_out; r++) { // r is frame index. + BaseFloat raw_log_energy = 0.0; + ExtractWindow(0, wave, r, computer_.GetFrameOptions(), + feature_window_function_, &window, + (use_raw_log_energy ? 
&raw_log_energy : NULL)); + + SubVector output_row(*output, r); + computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row); + } +} + +template +void OfflineFeatureTpl::Compute( + const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output, + Vector *deprecated_wave_remainder) const { + OfflineFeatureTpl temp(*this); + // call the non-const version of Compute() on a temporary copy of this object. + // This is a workaround for const-ness that may sometimes be useful in + // multi-threaded code, although it's not optimally efficient. + temp.Compute(wave, vtln_warp, output, deprecated_wave_remainder); +} + +} // end namespace kaldi + +#endif diff --git a/src/feat/feature-common.h b/src/feat/feature-common.h new file mode 100644 index 00000000000..70d8f4b043e --- /dev/null +++ b/src/feat/feature-common.h @@ -0,0 +1,161 @@ +// feat/feature-common.h + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABILITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_COMMON_H_ +#define KALDI_FEAT_FEATURE_COMMON_H_ + +#include +#include +#include "feat/feature-window.h" + +namespace kaldi { +/// @addtogroup feat FeatureCommon +/// @{ + + + +/// This class is only added for documentation, it is not intended to ever be +/// used. +struct ExampleFeatureComputerOptions { + FrameExtractionOptions frame_opts; + // .. more would go here. +}; + +/// This class is only added for documentation, it is not intended to ever be +/// used. It documents the interface of the *Computer classes which wrap the +/// low-level feature extraction. The template argument F of OfflineFeatureTpl must +/// follow this interface. This interface is intended for features such as +/// MFCCs and PLPs which can be computed frame by frame. +class ExampleFeatureComputer { + public: + typedef ExampleFeatureComputerOptions Options; + + /// Returns a reference to the frame-extraction options class, which + /// will be part of our own options class. + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + /// Returns the feature dimension + int32 Dim(); + + /// Returns true if this function may inspect the raw log-energy of the signal + /// (before windowing and pre-emphasis); it's safe to always return true, but + /// setting it to false enables an optimization. + bool NeedRawLogEnergy() { return true; } + + /// constructor from options class; it should not store a reference or pointer + /// to the options class but should copy it. + explicit ExampleFeatureComputer(const ExampleFeatureComputerOptions &opts): + opts_(opts) { } + + /// Copy constructor; all of these classes must have one. + ExampleFeatureComputer(const ExampleFeatureComputer &other); + + /** + Function that computes one frame of features from + one frame of signal. 
+ + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VTLN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + private: + // disallow assignment. + ExampleFeatureComputer &operator = (const ExampleFeatureComputer &in); + Options opts_; +}; + + +/// This templated class is intended for offline feature extraction, i.e. where +/// you have access to the entire signal at the start. It exists mainly to be +/// a drop-in replacement for the old (pre-2016) classes Mfcc, Plp and so on, for +/// use in the offline case. In April 2016 we reorganized the online +/// feature-computation code for greater modularity and to have correct support +/// for the snip-edges=false option. +template +class OfflineFeatureTpl { + public: + typedef typename F::Options Options; + + // Note: feature_window_function_ is the windowing function, which is initialized + // using the options class, that we cache at this level. + OfflineFeatureTpl(const Options &opts): + computer_(opts), + feature_window_function_(computer_.GetFrameOptions()) { } + + // Computes the features for one file (one sequence of features). + // Use of the 'deprecated_wave_remainder' argument is highly deprecated; it is + // only provided for back-compatibility for code that may have + // relied on the older interface. It's deprecated because it + // doesn't support the --snip-edges=false option, and because + // we plan to eventually remove this argument so that there + // will be only one way to do online feature extraction. + void Compute(const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output, + Vector *deprecated_wave_remainder = NULL); + + // This const version of Compute() is a wrapper that + // calls the non-const version on a temporary object. + // It's less efficient than the non-const version. + void Compute(const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output, + Vector *deprecated_wave_remainder = NULL) const; + + int32 Dim() const { return computer_.Dim(); } + + // Copy constructor. + OfflineFeatureTpl(const OfflineFeatureTpl &other): + computer_(other.computer_), + feature_window_function_(other.feature_window_function_) { } + private: + // Disallow assignment.
+ OfflineFeatureTpl &operator =(const OfflineFeatureTpl &other); + + F computer_; + FeatureWindowFunction feature_window_function_; +}; + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#include "feat/feature-common-inl.h" + +#endif // KALDI_FEAT_FEATURE_COMMON_H_ diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc index bac61ed2059..6b8d49e9403 100644 --- a/src/feat/feature-fbank.cc +++ b/src/feat/feature-fbank.cc @@ -1,6 +1,7 @@ // feat/feature-fbank.cc // Copyright 2009-2012 Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -20,11 +21,10 @@ #include "feat/feature-fbank.h" - namespace kaldi { -Fbank::Fbank(const FbankOptions &opts) - : opts_(opts), feature_window_function_(opts.frame_opts), srfft_(NULL) { +FbankComputer::FbankComputer(const FbankOptions &opts): + opts_(opts), srfft_(NULL) { if (opts.energy_floor > 0.0) log_energy_floor_ = Log(opts.energy_floor); @@ -33,21 +33,29 @@ Fbank::Fbank(const FbankOptions &opts) srfft_ = new SplitRadixRealFft(padded_window_size); // We'll definitely need the filterbanks info for VTLN warping factor 1.0. - // [note: this call caches it.] The reason we call this here is to - // improve the efficiency of the "const" version of Compute(). + // [note: this call caches it.] GetMelBanks(1.0); } -Fbank::~Fbank() { +FbankComputer::FbankComputer(const FbankComputer &other): + opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), srfft_(NULL) { for (std::map::iterator iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter) + iter->second = new MelBanks(*(iter->second)); + if (other.srfft_) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} + +FbankComputer::~FbankComputer() { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) delete iter->second; - if (srfft_ != NULL) - delete srfft_; + delete srfft_; } -const MelBanks *Fbank::GetMelBanks(BaseFloat vtln_warp) { +const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) { MelBanks *this_mel_banks = NULL; std::map::iterator iter = mel_banks_.find(vtln_warp); if (iter == mel_banks_.end()) { @@ -61,124 +69,52 @@ const MelBanks *Fbank::GetMelBanks(BaseFloat vtln_warp) { return this_mel_banks; } -const MelBanks *Fbank::GetMelBanks(BaseFloat vtln_warp, - bool *must_delete) const { - MelBanks *this_mel_banks = NULL; - std::map::const_iterator iter = - mel_banks_.find(vtln_warp); - if (iter == mel_banks_.end()) { - this_mel_banks = new MelBanks(opts_.mel_opts, - opts_.frame_opts, - vtln_warp); - *must_delete = true; - } else { - this_mel_banks = iter->second; - *must_delete = false; - } - return this_mel_banks; -} +void FbankComputer::Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { -void Fbank::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) { - const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); - ComputeInternal(wave, *this_mel_banks, output, wave_remainder); -} + const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); -void Fbank::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) const { - bool must_delete_mel_banks; - const MelBanks *mel_banks = GetMelBanks(vtln_warp, - &must_delete_mel_banks); - - ComputeInternal(wave, *mel_banks, output, wave_remainder); - - if (must_delete_mel_banks) - delete mel_banks; -} + 
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); -void Fbank::ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - Matrix *output, - Vector *wave_remainder) const { - KALDI_ASSERT(output != NULL); + // Compute energy after window function (not the raw one). + if (opts_.use_energy && !opts_.raw_energy) + signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::min())); - // Get dimensions of output features - int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); - int32 cols_out = opts_.mel_opts.num_bins + opts_.use_energy; - if (rows_out == 0) { - output->Resize(0, 0); - *wave_remainder = wave; - return; - } - // Prepare the output buffer - output->Resize(rows_out, cols_out); - - // Optionally extract the remainder for further processing - if (wave_remainder != NULL) - ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); - - // Buffers - Vector window; // windowed waveform. - Vector mel_energies; - std::vector temp_buffer; // used by srfft. - BaseFloat log_energy; - - // Compute all the freames, r is frame index.. - for (int32 r = 0; r < rows_out; r++) { - // Cut the window, apply window function - ExtractWindow(wave, r, opts_.frame_opts, feature_window_function_, &window, - (opts_.use_energy && opts_.raw_energy ? &log_energy : NULL)); - - // Compute energy after window function (not the raw one) - if (opts_.use_energy && !opts_.raw_energy) - log_energy = Log(std::max(VecVec(window, window), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(window.Data(), true, &temp_buffer); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(&window, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(&window); - SubVector power_spectrum(window, 0, window.Dim()/2 + 1); - - // Sum with MelFiterbank over power spectrum - mel_banks.Compute(power_spectrum, &mel_energies); - if (opts_.use_log_fbank) { - // avoid log of zero (which should be prevented anyway by dithering). - mel_energies.ApplyFloor(std::numeric_limits::min()); - mel_energies.ApplyLog(); // take the log. - } + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. + RealFft(signal_frame, true); - // Output buffers - SubVector this_output(output->Row(r)); - SubVector this_fbank(this_output.Range((opts_.use_energy? 1 : 0), - opts_.mel_opts.num_bins)); - - // Copy to output - this_fbank.CopyFromVec(mel_energies); - // Copy energy as first value - if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && log_energy < log_energy_floor_) { - log_energy = log_energy_floor_; - } - this_output(0) = log_energy; - } + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, 0, + signal_frame->Dim() / 2 + 1); + + int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0); + SubVector mel_energies(*feature, + mel_offset, + opts_.mel_opts.num_bins); + + // Sum with mel fiterbanks over the power spectrum + mel_banks.Compute(power_spectrum, &mel_energies); + if (opts_.use_log_fbank) { + // Avoid log of zero (which should be prevented anyway by dithering). + mel_energies.ApplyFloor(std::numeric_limits::epsilon()); + mel_energies.ApplyLog(); // take the log. 
+ } - // HTK compat: Shift features, so energy is last value - if (opts_.htk_compat && opts_.use_energy) { - BaseFloat energy = this_output(0); - for (int32 i = 0; i < opts_.mel_opts.num_bins; i++) { - this_output(i) = this_output(i+1); - } - this_output(opts_.mel_opts.num_bins) = energy; + // Copy energy as first value (or the last, if htk_compat == true). + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) { + signal_log_energy = log_energy_floor_; } + int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0; + (*feature)(energy_index) = signal_log_energy; } } diff --git a/src/feat/feature-fbank.h b/src/feat/feature-fbank.h index 966f305ea6c..b93ed6f58cf 100644 --- a/src/feat/feature-fbank.h +++ b/src/feat/feature-fbank.h @@ -1,6 +1,7 @@ // feat/feature-fbank.h // Copyright 2009-2012 Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,14 +24,17 @@ #include #include +#include "feat/feature-common.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -/// FbankOptions contains basic options for computing FBANK features +/// FbankOptions contains basic options for computing filterbank features. /// It only includes things that can be done in a "stateless" way, i.e. /// it does not include energy max-normalization. /// It does not include delta computation. @@ -42,7 +46,7 @@ struct FbankOptions { bool raw_energy; // If true, compute energy before preemphasis and windowing bool htk_compat; // If true, put energy last (if using energy) bool use_log_fbank; // if true (default), produce log-filterbank, else linear - + FbankOptions(): mel_opts(23), // defaults the #mel-banks to 23 for the FBANK computations. // this seems to be common for 16khz-sampled data, @@ -70,54 +74,67 @@ struct FbankOptions { } }; -class MelBanks; - /// Class for computing mel-filterbank features; see \ref feat_mfcc for more /// information. -class Fbank { +class FbankComputer { public: - explicit Fbank(const FbankOptions &opts); - ~Fbank(); - - int32 Dim() const { return opts_.mel_opts.num_bins; } - - /// Will throw exception on failure (e.g. if file too short for even one - /// frame). The output "wave_remainder" is the last frame or two of the - /// waveform that it would be necessary to include in the next call to Compute - /// for the same utterance. It is not exactly the un-processed part (it may - /// have been partly processed), it's the start of the next window that we - /// have not already processed. - void Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL); - - /// Const version of Compute() - void Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL) const; typedef FbankOptions Options; + + explicit FbankComputer(const FbankOptions &opts); + FbankComputer(const FbankComputer &other); + + int32 Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); + } + + bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + /** + Function that computes one frame of features from + one frame of signal. 
+ + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + ~FbankComputer(); + private: - void ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - Matrix *output, - Vector *wave_remainder = NULL) const; - const MelBanks *GetMelBanks(BaseFloat vtln_warp); - const MelBanks *GetMelBanks(BaseFloat vtln_warp, - bool *must_delete) const; FbankOptions opts_; BaseFloat log_energy_floor_; std::map mel_banks_; // BaseFloat is VTLN coefficient. - FeatureWindowFunction feature_window_function_; SplitRadixRealFft *srfft_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Fbank); + // Disallow assignment. + FbankComputer &operator =(const FbankComputer &other); }; +typedef OfflineFeatureTpl Fbank; /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/feature-functions.cc b/src/feat/feature-functions.cc index 9678e909a5a..b8a7b3178f2 100644 --- a/src/feat/feature-functions.cc +++ b/src/feat/feature-functions.cc @@ -26,162 +26,6 @@ namespace kaldi { -int32 NumFrames(int32 nsamp, - const FrameExtractionOptions &opts) { - int32 frame_shift = opts.WindowShift(); - int32 frame_length = opts.WindowSize(); - KALDI_ASSERT(frame_shift != 0 && frame_length != 0); - if (opts.snip_edges) { - if (static_cast(nsamp) < frame_length) - return 0; - else - return (1 + ((nsamp - frame_length) / frame_shift)); - // view the expression above as: nsamp-frame_length is how much room we - // have to shift the frame within the waveform; frame_shift is how much - // we shift it each time and the ratio is how many times we can shift - // it (integer arithmetic rounds down). 
- } else { - return (int32)(nsamp * 1.0f / frame_shift + 0.5f); - // if --snip-edges=false, the number of frames would be determined by - // rounding the (file-length / frame-shift) to the nearest integer - } -} - - -void Dither(VectorBase *waveform, BaseFloat dither_value) { - for (int32 i = 0; i < waveform->Dim(); i++) - (*waveform)(i) += RandGauss() * dither_value; -} - - -void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff) { - if (preemph_coeff == 0.0) return; - KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0); - for (int32 i = waveform->Dim()-1; i > 0; i--) - (*waveform)(i) -= preemph_coeff * (*waveform)(i-1); - (*waveform)(0) -= preemph_coeff * (*waveform)(0); -} - - - -FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) { - int32 frame_length = opts.WindowSize(); - KALDI_ASSERT(frame_length > 0); - window.Resize(frame_length); - for (int32 i = 0; i < frame_length; i++) { - BaseFloat i_fl = static_cast(i); - if (opts.window_type == "hanning") { - window(i) = 0.5 - 0.5*cos(M_2PI * i_fl / (frame_length-1)); - } else if (opts.window_type == "hamming") { - window(i) = 0.54 - 0.46*cos(M_2PI * i_fl / (frame_length-1)); - } else if (opts.window_type == "povey") { // like hamming but goes to zero at edges. - window(i) = pow(0.5 - 0.5*cos(M_2PI * i_fl / (frame_length-1)), 0.85); - } else if (opts.window_type == "rectangular") { - window(i) = 1.0; - } else { - KALDI_ERR << "Invalid window type " << opts.window_type; - } - } -} - -// ExtractWindow extracts a windowed frame of waveform with a power-of-two, -// padded size. It does mean subtraction, pre-emphasis and dithering as -// requested. - -void ExtractWindow(const VectorBase &wave, - int32 f, // with 0 <= f < NumFrames(feats, opts) - const FrameExtractionOptions &opts, - const FeatureWindowFunction &window_function, - Vector *window, - BaseFloat *log_energy_pre_window) { - int32 frame_shift = opts.WindowShift(); - int32 frame_length = opts.WindowSize(); - KALDI_ASSERT(window_function.window.Dim() == frame_length); - KALDI_ASSERT(frame_shift != 0 && frame_length != 0); - - Vector wave_part(frame_length); - if (opts.snip_edges) { - int32 start = frame_shift*f, end = start + frame_length; - KALDI_ASSERT(start >= 0 && end <= wave.Dim()); - wave_part.CopyFromVec(wave.Range(start, frame_length)); - } else { - // If opts.snip_edges = false, we allow the frames to go slightly over the - // edges of the file; we'll extend the data by reflection. - int32 mid = frame_shift * (f + 0.5), - begin = mid - frame_length / 2, - end = begin + frame_length, - begin_limited = std::max(0, begin), - end_limited = std::min(end, wave.Dim()), - length_limited = end_limited - begin_limited; - - // Copy the main part. Usually this will be the entire window. - wave_part.Range(begin_limited - begin, length_limited). - CopyFromVec(wave.Range(begin_limited, length_limited)); - - // Deal with any end effects by reflection, if needed. This code will - // rarely be reached, so we don't concern ourselves with efficiency. - for (int32 f = begin; f < 0; f++) { - int32 reflected_f = -f; - // The next statement will only have an effect in the case of files - // shorter than a single frame, it's to avoid a crash in those cases. 
- reflected_f = reflected_f % wave.Dim(); - wave_part(f - begin) = wave(reflected_f); - } - for (int32 f = wave.Dim(); f < end; f++) { - int32 distance_to_end = f - wave.Dim(); - // The next statement will only have an effect in the case of files - // shorter than a single frame, it's to avoid a crash in those cases. - distance_to_end = distance_to_end % wave.Dim(); - int32 reflected_f = wave.Dim() - 1 - distance_to_end; - wave_part(f - begin) = wave(reflected_f); - } - } - KALDI_ASSERT(window != NULL); - int32 frame_length_padded = opts.PaddedWindowSize(); - - if (window->Dim() != frame_length_padded) - window->Resize(frame_length_padded); - - SubVector window_part(*window, 0, frame_length); - window_part.CopyFromVec(wave_part); - - if (opts.dither != 0.0) Dither(&window_part, opts.dither); - - if (opts.remove_dc_offset) - window_part.Add(-window_part.Sum() / frame_length); - - if (log_energy_pre_window != NULL) { - BaseFloat energy = std::max(VecVec(window_part, window_part), - std::numeric_limits::min()); - *log_energy_pre_window = Log(energy); - } - - if (opts.preemph_coeff != 0.0) - Preemphasize(&window_part, opts.preemph_coeff); - - window_part.MulElements(window_function.window); - - if (frame_length != frame_length_padded) - SubVector(*window, frame_length, - frame_length_padded-frame_length).SetZero(); -} - -void ExtractWaveformRemainder(const VectorBase &wave, - const FrameExtractionOptions &opts, - Vector *wave_remainder) { - int32 frame_shift = opts.WindowShift(); - int32 num_frames = NumFrames(wave.Dim(), opts); - // offset is the amount at the start that has been extracted. - int32 offset = num_frames * frame_shift; - KALDI_ASSERT(wave_remainder != NULL); - int32 remaining_len = wave.Dim() - offset; - wave_remainder->Resize(remaining_len); - KALDI_ASSERT(remaining_len >= 0); - if (remaining_len > 0) - wave_remainder->CopyFromVec(SubVector(wave, offset, remaining_len)); -} - - void ComputePowerSpectrum(VectorBase *waveform) { int32 dim = waveform->Dim(); @@ -341,22 +185,6 @@ void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts, } - - - -void GetEqualLoudnessVector(const MelBanks &mel_banks, - Vector *ans) { - int32 n = mel_banks.NumBins(); - // central freq of each mel bin - const Vector &f0 = mel_banks.GetCenterFreqs(); - ans->Resize(n); - for (int32 i = 0; i < n; i++) { - BaseFloat fsq = f0(i) * f0(i); - BaseFloat fsub = fsq / (fsq + 1.6e5); - (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6)); - } -} - void InitIdftBases(int32 n_bases, int32 dimension, Matrix *mat_out) { BaseFloat angle = M_PI / static_cast(dimension - 1); BaseFloat scale = 1.0f / (2.0 * static_cast(dimension - 1)); @@ -374,20 +202,6 @@ void InitIdftBases(int32 n_bases, int32 dimension, Matrix *mat_out) { } } -// Compute LP coefficients from autocorrelation coefficients. 
-BaseFloat ComputeLpc(const VectorBase &autocorr_in, - Vector *lpc_out) { - int32 n = autocorr_in.Dim() - 1; - KALDI_ASSERT(lpc_out->Dim() == n); - Vector tmp(n); - BaseFloat ans = Durbin(n, autocorr_in.Data(), - lpc_out->Data(), - tmp.Data()); - if (ans <= 0.0) - KALDI_WARN << "Zero energy in LPC computation"; - return -Log((double)1.0/ans); // forms the C0 value -} - void SpliceFrames(const MatrixBase &input_features, int32 left_context, int32 right_context, diff --git a/src/feat/feature-functions.h b/src/feat/feature-functions.h index c5dfe9a3010..42a9703757f 100644 --- a/src/feat/feature-functions.h +++ b/src/feat/feature-functions.h @@ -2,6 +2,7 @@ // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation // 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -28,141 +29,12 @@ #include "matrix/matrix-lib.h" #include "util/common-utils.h" #include "base/kaldi-error.h" -#include "feat/mel-computations.h" namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -struct MelBanksOptions { - int32 num_bins; // e.g. 25; number of triangular bins - BaseFloat low_freq; // e.g. 20; lower frequency cutoff - BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative - // ->added to the Nyquist frequency to get the cutoff. - BaseFloat vtln_low; // vtln lower cutoff of warping function. - BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added - // to the Nyquist frequency to get the cutoff. - bool debug_mel; - // htk_mode is a "hidden" config, it does not show up on command line. - // Enables more exact compatibibility with HTK, for testing purposes. Affects - // mel-energy flooring and reproduces a bug in HTK. - bool htk_mode; - explicit MelBanksOptions(int num_bins = 25) - : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100), - vtln_high(-500), debug_mel(false), htk_mode(false) {} - - void Register(OptionsItf *opts) { - opts->Register("num-mel-bins", &num_bins, - "Number of triangular mel-frequency bins"); - opts->Register("low-freq", &low_freq, - "Low cutoff frequency for mel bins"); - opts->Register("high-freq", &high_freq, - "High cutoff frequency for mel bins (if < 0, offset from Nyquist)"); - opts->Register("vtln-low", &vtln_low, - "Low inflection point in piecewise linear VTLN warping function"); - opts->Register("vtln-high", &vtln_high, - "High inflection point in piecewise linear VTLN warping function" - " (if negative, offset from high-mel-freq"); - opts->Register("debug-mel", &debug_mel, - "Print out debugging information for mel bin computation"); - } -}; - - -struct FrameExtractionOptions { - BaseFloat samp_freq; - BaseFloat frame_shift_ms; // in milliseconds. - BaseFloat frame_length_ms; // in milliseconds. - BaseFloat dither; // Amount of dithering, 0.0 means no dither. - BaseFloat preemph_coeff; // Preemphasis coefficient. - bool remove_dc_offset; // Subtract mean of wave before FFT. - std::string window_type; // e.g. Hamming window - bool round_to_power_of_two; - bool snip_edges; - // Maybe "hamming", "rectangular", "povey", "hanning" - // "povey" is a window I made to be similar to Hamming but to go to zero at the - // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) - // I just don't think the Hamming window makes sense as a windowing function. 
- FrameExtractionOptions(): - samp_freq(16000), - frame_shift_ms(10.0), - frame_length_ms(25.0), - dither(1.0), - preemph_coeff(0.97), - remove_dc_offset(true), - window_type("povey"), - round_to_power_of_two(true), - snip_edges(true){ } - - void Register(OptionsItf *opts) { - opts->Register("sample-frequency", &samp_freq, - "Waveform data sample frequency (must match the waveform file, " - "if specified there)"); - opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds"); - opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds"); - opts->Register("preemphasis-coefficient", &preemph_coeff, - "Coefficient for use in signal preemphasis"); - opts->Register("remove-dc-offset", &remove_dc_offset, - "Subtract mean from waveform on each frame"); - opts->Register("dither", &dither, "Dithering constant (0.0 means no dither)"); - opts->Register("window-type", &window_type, "Type of window " - "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\")"); - opts->Register("round-to-power-of-two", &round_to_power_of_two, - "If true, round window size to power of two."); - opts->Register("snip-edges", &snip_edges, - "If true, end effects will be handled by outputting only frames that " - "completely fit in the file, and the number of frames depends on the " - "frame-length. If false, the number of frames depends only on the " - "frame-shift, and we reflect the data at the ends."); - } - int32 WindowShift() const { - return static_cast(samp_freq * 0.001 * frame_shift_ms); - } - int32 WindowSize() const { - return static_cast(samp_freq * 0.001 * frame_length_ms); - } - int32 PaddedWindowSize() const { - return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) : - WindowSize()); - } -}; - - -struct FeatureWindowFunction { - FeatureWindowFunction() {} - explicit FeatureWindowFunction(const FrameExtractionOptions &opts); - Vector window; -}; - -int32 NumFrames(int32 wave_length, - const FrameExtractionOptions &opts); - -void Dither(VectorBase *waveform, BaseFloat dither_value); - -void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff); - - -// ExtractWindow extracts a windowed frame of waveform with a power-of-two, -// padded size. If log_energy_pre_window != NULL, outputs the log of the -// sum-of-squared samples before preemphasis and windowing -void ExtractWindow(const VectorBase &wave, - int32 f, // with 0 <= f < NumFrames(wave.Dim(), opts) - const FrameExtractionOptions &opts, - const FeatureWindowFunction &window_function, - Vector *window, - BaseFloat *log_energy_pre_window = NULL); - -// ExtractWaveformRemainder is useful if the waveform is coming in segments. -// It extracts the bit of the waveform at the end of this block that you -// would have to append the next bit of waveform to, if you wanted to have -// the same effect as everything being in one big block. -void ExtractWaveformRemainder(const VectorBase &wave, - const FrameExtractionOptions &opts, - Vector *wave_remainder); - - - // ComputePowerSpectrum converts a complex FFT (as produced by the FFT // functions in matrix/matrix-functions.h), and converts it into // a power spectrum. If the complex FFT is a vector of size n (representing @@ -173,22 +45,6 @@ void ExtractWaveformRemainder(const VectorBase &wave, void ComputePowerSpectrum(VectorBase *complex_fft); - -inline void MaxNormalizeEnergy(Matrix *feats) { - // Just subtract the largest energy value... assume energy is the first - // column of the mfcc features. 
Don't do the flooring of energy (dithering - // should prevent exact zeros). - // We didn't put this in the main MFCC computation as we wanted to make sure - // it is stateless (so we can do it bit by bit for large waveforms). - // not compatible with the order_as_htk_ option in MfccOptions. - SubMatrix energy(*feats, 0, feats->NumRows(), 0, 1); - energy.Add(-energy.Max()); -} - - - - - struct DeltaFeaturesOptions { int32 order; int32 window; // e.g. 2; controls window size (window size is 2*window + 1) @@ -293,19 +149,10 @@ void SpliceFrames(const MatrixBase &input_features, void ReverseFrames(const MatrixBase &input_features, Matrix *output_features); -class MelBanks; - -void GetEqualLoudnessVector(const MelBanks &mel_banks, - Vector *ans); - void InitIdftBases(int32 n_bases, int32 dimension, Matrix *mat_out); -// Compute LP coefficients from autocorrelation coefficients. -BaseFloat ComputeLpc(const VectorBase &autocorr_in, - Vector *lpc_out); - // This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which // is online CMN with no latency, for online speech recognition. struct SlidingWindowCmnOptions { diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc index 135152733d9..c1962a5c1d1 100644 --- a/src/feat/feature-mfcc.cc +++ b/src/feat/feature-mfcc.cc @@ -1,6 +1,7 @@ // feat/feature-mfcc.cc // Copyright 2009-2011 Karel Vesely; Petr Motlicek +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,8 +24,64 @@ namespace kaldi { -Mfcc::Mfcc(const MfccOptions &opts) - : opts_(opts), feature_window_function_(opts.frame_opts), srfft_(NULL) { + +void MfccComputer::Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); + + const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); + + if (opts_.use_energy && !opts_.raw_energy) + signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::min())); + + if (srfft_ != NULL) // Compute FFT using the split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, 0, + signal_frame->Dim() / 2 + 1); + + mel_banks.Compute(power_spectrum, &mel_energies_); + + // avoid log of zero (which should be prevented anyway by dithering). + mel_energies_.ApplyFloor(std::numeric_limits::epsilon()); + mel_energies_.ApplyLog(); // take the log. + + feature->SetZero(); // in case there were NaNs. + // feature = dct_matrix_ * mel_energies [which now have log] + feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0); + + if (opts_.cepstral_lifter != 0.0) + feature->MulElements(lifter_coeffs_); + + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) + signal_log_energy = log_energy_floor_; + (*feature)(0) = signal_log_energy; + } + + if (opts_.htk_compat) { + BaseFloat energy = (*feature)(0); + for (int32 i = 0; i < opts_.num_ceps - 1; i++) + (*feature)(i) = (*feature)(i+1); + if (!opts_.use_energy) + energy *= M_SQRT2; // scale on C0 (actually removing a scale + // we previously added that's part of one common definition of + // the cosine transform.) 
+ (*feature)(opts_.num_ceps - 1) = energy; + } +} + +MfccComputer::MfccComputer(const MfccOptions &opts): + opts_(opts), srfft_(NULL), + mel_energies_(opts.mel_opts.num_bins) { int32 num_bins = opts.mel_opts.num_bins; Matrix dct_matrix(num_bins, num_bins); ComputeDctMatrix(&dct_matrix); @@ -44,23 +101,37 @@ Mfcc::Mfcc(const MfccOptions &opts) int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... srfft_ = new SplitRadixRealFft(padded_window_size); - + // We'll definitely need the filterbanks info for VTLN warping factor 1.0. - // [note: this call caches it.] The reason we call this here is to - // improve the efficiency of the "const" version of Compute(). + // [note: this call caches it.] GetMelBanks(1.0); } -Mfcc::~Mfcc() { +MfccComputer::MfccComputer(const MfccComputer &other): + opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_), + dct_matrix_(other.dct_matrix_), + log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), + srfft_(NULL), + mel_energies_(other.mel_energies_.Dim(), kUndefined) { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) + iter->second = new MelBanks(*(iter->second)); + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} + + + +MfccComputer::~MfccComputer() { for (std::map::iterator iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter) delete iter->second; - if (srfft_ != NULL) - delete srfft_; + delete srfft_; } -const MelBanks *Mfcc::GetMelBanks(BaseFloat vtln_warp) { +const MelBanks *MfccComputer::GetMelBanks(BaseFloat vtln_warp) { MelBanks *this_mel_banks = NULL; std::map::iterator iter = mel_banks_.find(vtln_warp); if (iter == mel_banks_.end()) { @@ -75,117 +146,5 @@ const MelBanks *Mfcc::GetMelBanks(BaseFloat vtln_warp) { } -const MelBanks *Mfcc::GetMelBanks(BaseFloat vtln_warp, bool *must_delete) const { - MelBanks *this_mel_banks = NULL; - std::map::const_iterator iter = - mel_banks_.find(vtln_warp); - if (iter == mel_banks_.end()) { - this_mel_banks = new MelBanks(opts_.mel_opts, - opts_.frame_opts, - vtln_warp); - *must_delete = true; - } else { - this_mel_banks = iter->second; - *must_delete = false; - } - return this_mel_banks; -} - - -void Mfcc::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) { - const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); - ComputeInternal(wave, *this_mel_banks, output, wave_remainder); -} - -void Mfcc::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) const { - bool must_delete_mel_banks; - const MelBanks *mel_banks = GetMelBanks(vtln_warp, - &must_delete_mel_banks); - - ComputeInternal(wave, *mel_banks, output, wave_remainder); - - if (must_delete_mel_banks) - delete mel_banks; -} - -void Mfcc::ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - Matrix *output, - Vector *wave_remainder) const { - KALDI_ASSERT(output != NULL); - int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts), - cols_out = opts_.num_ceps; - if (rows_out == 0) { - output->Resize(0, 0); - *wave_remainder = wave; - return; - } - output->Resize(rows_out, cols_out); - if (wave_remainder != NULL) - ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); - Vector window; // windowed waveform. - Vector mel_energies; - std::vector temp_buffer; // used by srfft. - for (int32 r = 0; r < rows_out; r++) { // r is frame index.. 
- BaseFloat log_energy; - ExtractWindow(wave, r, opts_.frame_opts, feature_window_function_, &window, - (opts_.use_energy && opts_.raw_energy ? &log_energy : NULL)); - - if (opts_.use_energy && !opts_.raw_energy) - log_energy = Log(std::max(VecVec(window, window), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using the split-radix algorithm. - srfft_->Compute(window.Data(), true, &temp_buffer); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(&window, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(&window); - SubVector power_spectrum(window, 0, window.Dim()/2 + 1); - - mel_banks.Compute(power_spectrum, &mel_energies); - - // avoid log of zero (which should be prevented anyway by dithering). - mel_energies.ApplyFloor(std::numeric_limits::min()); - mel_energies.ApplyLog(); // take the log. - - SubVector this_mfcc(output->Row(r)); - - // this_mfcc = dct_matrix_ * mel_energies [which now have log] - this_mfcc.AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies, 0.0); - - if (opts_.cepstral_lifter != 0.0) - this_mfcc.MulElements(lifter_coeffs_); - - if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && log_energy < log_energy_floor_) - log_energy = log_energy_floor_; - this_mfcc(0) = log_energy; - } - - if (opts_.htk_compat) { - BaseFloat energy = this_mfcc(0); - for (int32 i = 0; i < opts_.num_ceps-1; i++) - this_mfcc(i) = this_mfcc(i+1); - if (!opts_.use_energy) - energy *= M_SQRT2; // scale on C0 (actually removing scale - // we previously added that's part of one common definition of - // cosine transform.) - this_mfcc(opts_.num_ceps-1) = energy; - } - } -} - - - - - } // namespace kaldi diff --git a/src/feat/feature-mfcc.h b/src/feat/feature-mfcc.h index 1f814333390..d1d2b8f9d09 100644 --- a/src/feat/feature-mfcc.h +++ b/src/feat/feature-mfcc.h @@ -1,6 +1,7 @@ // feat/feature-mfcc.h // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University +// 2014-2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,17 +24,17 @@ #include #include +#include "feat/feature-common.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -/// MfccOptions contains basic options for computing MFCC features -/// It only includes things that can be done in a "stateless" way, i.e. -/// it does not include energy max-normalization. -/// It does not include delta computation. +/// MfccOptions contains basic options for computing MFCC features. struct MfccOptions { FrameExtractionOptions frame_opts; MelBanksOptions mel_opts; @@ -77,56 +78,70 @@ struct MfccOptions { } }; -class MelBanks; -/// Class for computing MFCC features; see \ref feat_mfcc for more information. -class Mfcc { +// This is the new-style interface to the MFCC computation. +class MfccComputer { public: - explicit Mfcc(const MfccOptions &opts); - ~Mfcc(); + typedef MfccOptions Options; + explicit MfccComputer(const MfccOptions &opts); + MfccComputer(const MfccComputer &other); + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } int32 Dim() const { return opts_.num_ceps; } - /// Will throw exception on failure (e.g. if file too short for even one - /// frame). The output "wave_remainder" is the last frame or two of the - /// waveform that it would be necessary to include in the next call to Compute - /// for the same utterance. 
It is not exactly the un-processed part (it may - /// have been partly processed), it's the start of the next window that we - /// have not already processed. - void Compute(const VectorBase &wave, + bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_log_energy, BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL); + VectorBase *signal_frame, + VectorBase *feature); - /// Const version of Compute() - void Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL) const; - - typedef MfccOptions Options; + ~MfccComputer(); private: - void ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - Matrix *output, - Vector *wave_remainder = NULL) const; - + // disallow assignment. + MfccComputer &operator = (const MfccComputer &in); + const MelBanks *GetMelBanks(BaseFloat vtln_warp); - const MelBanks *GetMelBanks(BaseFloat vtln_warp, - bool *must_delete) const; - MfccOptions opts_; Vector lifter_coeffs_; Matrix dct_matrix_; // matrix we left-multiply by to perform DCT. BaseFloat log_energy_floor_; std::map mel_banks_; // BaseFloat is VTLN coefficient. - FeatureWindowFunction feature_window_function_; SplitRadixRealFft *srfft_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Mfcc); + + // note: mel_energies_ is specific to the frame we're processing, it's + // just a temporary workspace. 
+ Vector mel_energies_; }; +typedef OfflineFeatureTpl Mfcc; + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/feature-plp.cc b/src/feat/feature-plp.cc index fe439864346..0034027cbe6 100644 --- a/src/feat/feature-plp.cc +++ b/src/feat/feature-plp.cc @@ -1,6 +1,7 @@ // feat/feature-plp.cc // Copyright 2009-2011 Petr Motlicek; Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -19,13 +20,16 @@ #include "feat/feature-plp.h" -#include "util/parse-options.h" - namespace kaldi { -Plp::Plp(const PlpOptions &opts) - : opts_(opts), feature_window_function_(opts.frame_opts), srfft_(NULL) { +PlpComputer::PlpComputer(const PlpOptions &opts): + opts_(opts), srfft_(NULL), + mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined), + autocorr_coeffs_(opts_.lpc_order + 1, kUndefined), + lpc_coeffs_(opts_.lpc_order, kUndefined), + raw_cepstrum_(opts_.lpc_order, kUndefined) { + if (opts.cepstral_lifter != 0.0) { lifter_coeffs_.Resize(opts.num_ceps); ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_); @@ -41,28 +45,42 @@ Plp::Plp(const PlpOptions &opts) srfft_ = new SplitRadixRealFft(padded_window_size); // We'll definitely need the filterbanks info for VTLN warping factor 1.0. - // [note: this call caches it.] The reason we call this here is to - // improve the efficiency of the "const" version of Compute(). + // [note: this call caches it.] GetMelBanks(1.0); } -Plp::~Plp() { +PlpComputer::PlpComputer(const PlpComputer &other): + opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_), + idft_bases_(other.idft_bases_), log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), equal_loudness_(other.equal_loudness_), + srfft_(NULL), + mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined), + autocorr_coeffs_(opts_.lpc_order + 1, kUndefined), + lpc_coeffs_(opts_.lpc_order, kUndefined), + raw_cepstrum_(opts_.lpc_order, kUndefined) { for (std::map::iterator iter = mel_banks_.begin(); - iter != mel_banks_.end(); - ++iter) - delete iter->second; + iter != mel_banks_.end(); ++iter) + iter->second = new MelBanks(*(iter->second)); + for (std::map*>::iterator + iter = equal_loudness_.begin(); + iter != equal_loudness_.end(); ++iter) + iter->second = new Vector(*(iter->second)); + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} - for (std::map* >::iterator iter = equal_loudness_.begin(); - iter != equal_loudness_.end(); - ++iter) +PlpComputer::~PlpComputer() { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) delete iter->second; - - if (srfft_ != NULL) - delete srfft_; + for (std::map* >::iterator + iter = equal_loudness_.begin(); + iter != equal_loudness_.end(); ++iter) + delete iter->second; + delete srfft_; } -const MelBanks *Plp::GetMelBanks(BaseFloat vtln_warp) { +const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) { MelBanks *this_mel_banks = NULL; std::map::iterator iter = mel_banks_.find(vtln_warp); if (iter == mel_banks_.end()) { @@ -76,23 +94,7 @@ const MelBanks *Plp::GetMelBanks(BaseFloat vtln_warp) { return this_mel_banks; } -const MelBanks *Plp::GetMelBanks(BaseFloat vtln_warp, bool *must_delete) const { - MelBanks *this_mel_banks = NULL; - std::map::const_iterator iter = - mel_banks_.find(vtln_warp); - if (iter == mel_banks_.end()) { - this_mel_banks = new MelBanks(opts_.mel_opts, - opts_.frame_opts, - vtln_warp); - *must_delete = true; - } else { - this_mel_banks 
= iter->second; - *must_delete = false; - } - return this_mel_banks; -} - -const Vector *Plp::GetEqualLoudness(BaseFloat vtln_warp) { +const Vector *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) { const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); Vector *ans = NULL; std::map*>::iterator iter @@ -107,160 +109,81 @@ const Vector *Plp::GetEqualLoudness(BaseFloat vtln_warp) { return ans; } +void PlpComputer::Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); -const Vector *Plp::GetEqualLoudness(BaseFloat vtln_warp, - const MelBanks &mel_banks, - bool *must_delete) const { - Vector *ans = NULL; - std::map*>::const_iterator iter - = equal_loudness_.find(vtln_warp); - if (iter == equal_loudness_.end()) { - ans = new Vector; - GetEqualLoudnessVector(mel_banks, ans); - *must_delete = true; - } else { - ans = iter->second; - *must_delete = false; - } - return ans; -} + const MelBanks &mel_banks = *GetMelBanks(vtln_warp); + const Vector &equal_loudness = *GetEqualLoudness(vtln_warp); -void Plp::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) { - const MelBanks *mel_banks = GetMelBanks(vtln_warp); - const Vector *equal_loudness = GetEqualLoudness(vtln_warp); - ComputeInternal(wave, *mel_banks, - *equal_loudness, - output, wave_remainder); -} + KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1); // our num-ceps includes C0. -void Plp::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) const { - bool must_delete_mel_banks, must_delete_equal_loudness; - const MelBanks *mel_banks = GetMelBanks(vtln_warp, - &must_delete_mel_banks); - const Vector *equal_loudness - = GetEqualLoudness(vtln_warp, *mel_banks, - &must_delete_equal_loudness); - - ComputeInternal(wave, *mel_banks, *equal_loudness, - output, wave_remainder); - - if (must_delete_mel_banks) - delete mel_banks; - if (must_delete_equal_loudness) - delete equal_loudness; -} + if (opts_.use_energy && !opts_.raw_energy) + signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::min())); + + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); // elements 0 ... signal_frame->Dim()/2 + + SubVector power_spectrum(*signal_frame, + 0, signal_frame->Dim() / 2 + 1); -void Plp::ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - const Vector &equal_loudness, - Matrix *output, - Vector *wave_remainder) const { - KALDI_ASSERT(output != NULL); - int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts), - cols_out = opts_.num_ceps; - if (rows_out == 0) { - output->Resize(0, 0); - *wave_remainder = wave; - return; - } - output->Resize(rows_out, cols_out); - if (wave_remainder != NULL) - ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); - Vector window; // windowed waveform. 
int32 num_mel_bins = opts_.mel_opts.num_bins; - Vector mel_energies(num_mel_bins); - Vector mel_energies_duplicated(num_mel_bins+2); - Vector autocorr_coeffs(opts_.lpc_order+1); - Vector lpc_coeffs(opts_.lpc_order); - Vector raw_cepstrum(opts_.lpc_order); // not including C0, - // and size may differ from final size. - Vector final_cepstrum(opts_.num_ceps); - std::vector temp_buffer; // used by srfft. - - KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1); // our num-ceps includes C0. - for (int32 r = 0; r < rows_out; r++) { // r is frame index.. - BaseFloat log_energy; - ExtractWindow(wave, r, opts_.frame_opts, - feature_window_function_, &window, - (opts_.use_energy && opts_.raw_energy ? &log_energy : NULL)); - - if (opts_.use_energy && !opts_.raw_energy) - log_energy = Log(std::max(VecVec(window, window), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(window.Data(), true, &temp_buffer); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(&window, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(&window); // elements 0 ... window.Dim()/2 - - SubVector power_spectrum(window, 0, window.Dim()/2 + 1); - - mel_banks.Compute(power_spectrum, &mel_energies); - - mel_energies.MulElements(equal_loudness); - - mel_energies.ApplyPow(opts_.compress_factor); - - // duplicate first and last elements. - { - SubVector v(mel_energies_duplicated, 1, num_mel_bins); - v.CopyFromVec(mel_energies); - } - mel_energies_duplicated(0) = mel_energies(0); - mel_energies_duplicated(num_mel_bins+1) = mel_energies(num_mel_bins-1); - - autocorr_coeffs.AddMatVec(1.0, idft_bases_, kNoTrans, - mel_energies_duplicated, 0.0); - - BaseFloat energy = ComputeLpc(autocorr_coeffs, &lpc_coeffs); - - energy = std::max(energy, - std::numeric_limits::min()); - - Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs.Data(), raw_cepstrum.Data()); - { - SubVector dst(final_cepstrum, 1, opts_.num_ceps-1); - SubVector src(raw_cepstrum, 0, opts_.num_ceps-1); - dst.CopyFromVec(src); - final_cepstrum(0) = energy; - } - - if (opts_.cepstral_lifter != 0.0) - final_cepstrum.MulElements(lifter_coeffs_); - - if (opts_.cepstral_scale != 1.0) - final_cepstrum.Scale(opts_.cepstral_scale); - - if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && log_energy < log_energy_floor_) - log_energy = log_energy_floor_; - final_cepstrum(0) = log_energy; - } - - if (opts_.htk_compat) { - BaseFloat energy = final_cepstrum(0); - for (int32 i = 0; i < opts_.num_ceps-1; i++) - final_cepstrum(i) = final_cepstrum(i+1); - // if (!opts_.use_energy) - // energy *= M_SQRT2; // scale on C0 (actually removing scale - // we previously added that's part of one common definition of - // cosine transform.) 
- final_cepstrum(opts_.num_ceps-1) = energy; - } - - output->Row(r).CopyFromVec(final_cepstrum); - // std::cout << "FIN" << final_cepstrum; + + SubVector mel_energies(mel_energies_duplicated_, 1, num_mel_bins); + + mel_banks.Compute(power_spectrum, &mel_energies); + + mel_energies.MulElements(equal_loudness); + + mel_energies.ApplyPow(opts_.compress_factor); + + // duplicate first and last elements + mel_energies_duplicated_(0) = mel_energies_duplicated_(1); + mel_energies_duplicated_(num_mel_bins + 1) = + mel_energies_duplicated_(num_mel_bins); + + autocorr_coeffs_.SetZero(); // In case of NaNs or infs + autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans, + mel_energies_duplicated_, 0.0); + + BaseFloat residual_log_energy = ComputeLpc(autocorr_coeffs_, &lpc_coeffs_); + + residual_log_energy = std::max(residual_log_energy, + std::numeric_limits::min()); + + Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data()); + feature->Range(1, opts_.num_ceps - 1).CopyFromVec( + raw_cepstrum_.Range(0, opts_.num_ceps - 1)); + (*feature)(0) = residual_log_energy; + + if (opts_.cepstral_lifter != 0.0) + feature->MulElements(lifter_coeffs_); + + if (opts_.cepstral_scale != 1.0) + feature->Scale(opts_.cepstral_scale); + + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) + signal_log_energy = log_energy_floor_; + (*feature)(0) = signal_log_energy; + } + + if (opts_.htk_compat) { // reorder the features. + BaseFloat log_energy = (*feature)(0); + for (int32 i = 0; i < opts_.num_ceps-1; i++) + (*feature)(i) = (*feature)(i+1); + (*feature)(opts_.num_ceps-1) = log_energy; } } diff --git a/src/feat/feature-plp.h b/src/feat/feature-plp.h index bbcbecc21c8..d7deab07ec1 100644 --- a/src/feat/feature-plp.h +++ b/src/feat/feature-plp.h @@ -23,9 +23,11 @@ #include #include +#include "feat/feature-common.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" #include "itf/options-itf.h" -#include "matrix/kaldi-matrix-inl.h" namespace kaldi { /// @addtogroup feat FeatureExtraction @@ -86,68 +88,84 @@ struct PlpOptions { opts->Register("cepstral-scale", &cepstral_scale, "Scaling constant in PLP computation"); opts->Register("htk-compat", &htk_compat, - "If true, put energy or C0 last and put factor of sqrt(2) on " - "C0. Warning: not sufficient to get HTK compatible features " - "(need to change other parameters)."); + "If true, put energy or C0 last. Warning: not sufficient " + "to get HTK compatible features (need to change other " + "parameters)."); } }; -/// Class for computing PLP features. See \ref feat_plp where -/// documentation will eventually be added. -class Plp { +/// This is the new-style interface to the PLP computation. +class PlpComputer { public: - explicit Plp(const PlpOptions &opts); - ~Plp(); + typedef PlpOptions Options; + explicit PlpComputer(const PlpOptions &opts); + PlpComputer(const PlpComputer &other); + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } int32 Dim() const { return opts_.num_ceps; } - /// Will throw exception on failure (e.g. if file too short for even one - /// frame). The output "wave_remainder" is the last frame or two of the - /// waveform that it would be necessary to include in the next call to Compute - /// for the same utterance. It is not exactly the un-processed part (it may - /// have been partly processed), it's the start of the next window that we - /// have not already processed. 
Will throw exception on failure (e.g. if file - /// too short for even one frame). - void Compute(const VectorBase &wave, + bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_log_energy, BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL); + VectorBase *signal_frame, + VectorBase *feature); - typedef PlpOptions Options; - /// Const version of Compute() - void Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL) const; + ~PlpComputer(); private: - void ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - const Vector &equal_loudness, - Matrix *output, - Vector *wave_remainder = NULL) const; const MelBanks *GetMelBanks(BaseFloat vtln_warp); - const MelBanks *GetMelBanks(BaseFloat vtln_warp, bool *must_delete) const; - const Vector *GetEqualLoudness(BaseFloat vtln_warp); - const Vector *GetEqualLoudness(BaseFloat vtln_warp, - const MelBanks &mel_banks, - bool *must_delete) const; - PlpOptions opts_; Vector lifter_coeffs_; Matrix idft_bases_; BaseFloat log_energy_floor_; std::map mel_banks_; // BaseFloat is VTLN coefficient. std::map* > equal_loudness_; - FeatureWindowFunction feature_window_function_; SplitRadixRealFft *srfft_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Plp); + + // temporary vector used inside Compute; size is opts_.mel_opts.num_bins + 2 + Vector mel_energies_duplicated_; + // temporary vector used inside Compute; size is opts_.lpc_order + 1 + Vector autocorr_coeffs_; + // temporary vector used inside Compute; size is opts_.lpc_order + Vector lpc_coeffs_; + // temporary vector used inside Compute; size is opts_.lpc_order + Vector raw_cepstrum_; + + // Disallow assignment. 
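  // For reference, a summary of how Compute() (in feature-plp.cc) uses the
  // temporaries above, lightly paraphrased from the implementation:
  // power spectrum -> mel filterbank energies (written into the middle of
  // mel_energies_duplicated_, whose first and last elements are then
  // duplicated) -> equal-loudness weighting -> power compression by
  // opts_.compress_factor -> IDFT to autocorr_coeffs_ -> Durbin recursion
  // into lpc_coeffs_ -> Lpc2Cepstrum into raw_cepstrum_ -> liftering,
  // scaling and the energy / HTK-compatibility adjustments on the output.
  // The equal-loudness weight for a bin with center frequency f is
  // (f^2 / (f^2 + 1.6e5))^2 * (f^2 + 1.44e6) / (f^2 + 9.61e6), as computed
  // by GetEqualLoudnessVector() in mel-computations.cc.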
+ PlpComputer &operator =(const PlpComputer &other); }; +typedef OfflineFeatureTpl Plp; + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/feature-spectrogram.cc b/src/feat/feature-spectrogram.cc index df915ad90fe..953f38fc54f 100644 --- a/src/feat/feature-spectrogram.cc +++ b/src/feat/feature-spectrogram.cc @@ -24,8 +24,8 @@ namespace kaldi { -Spectrogram::Spectrogram(const SpectrogramOptions &opts) - : opts_(opts), feature_window_function_(opts.frame_opts), srfft_(NULL) { +SpectrogramComputer::SpectrogramComputer(const SpectrogramOptions &opts) + : opts_(opts), srfft_(NULL) { if (opts.energy_floor > 0.0) log_energy_floor_ = Log(opts.energy_floor); @@ -34,63 +34,49 @@ Spectrogram::Spectrogram(const SpectrogramOptions &opts) srfft_ = new SplitRadixRealFft(padded_window_size); } -Spectrogram::~Spectrogram() { - if (srfft_ != NULL) - delete srfft_; +SpectrogramComputer::SpectrogramComputer(const SpectrogramComputer &other): + opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), srfft_(NULL) { + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*other.srfft_); } -void Spectrogram::Compute(const VectorBase &wave, - Matrix *output, - Vector *wave_remainder) { - KALDI_ASSERT(output != NULL); - - // Get dimensions of output features - int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); - int32 cols_out = opts_.frame_opts.PaddedWindowSize()/2 +1; - if (rows_out == 0) - KALDI_ERR << "No frames fit in file (#samples is " << wave.Dim() << ")"; - // Prepare the output buffer - output->Resize(rows_out, cols_out); - - // Optionally extract the remainder for further processing - if (wave_remainder != NULL) - ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); - - // Buffers - Vector window; // windowed waveform. - BaseFloat log_energy; - - // Compute all the freames, r is frame index.. - for (int32 r = 0; r < rows_out; r++) { - // Cut the window, apply window function - ExtractWindow(wave, r, opts_.frame_opts, feature_window_function_, - &window, (opts_.raw_energy ? &log_energy : NULL)); - - // Compute energy after window function (not the raw one) - if (!opts_.raw_energy) - log_energy = Log(std::max(VecVec(window, window), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(window.Data(), true); - else // An alternative algorithm that works for non-powers-of-two - RealFft(&window, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(&window); - SubVector power_spectrum(window, 0, window.Dim()/2 + 1); - - power_spectrum.ApplyFloor(std::numeric_limits::min()); - power_spectrum.ApplyLog(); - - // Output buffers - SubVector this_output(output->Row(r)); - this_output.CopyFromVec(power_spectrum); - if (opts_.energy_floor > 0.0 && log_energy < log_energy_floor_) { - log_energy = log_energy_floor_; - } - this_output(0) = log_energy; - } +SpectrogramComputer::~SpectrogramComputer() { + delete srfft_; +} + +void SpectrogramComputer::Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); + + + // Compute energy after window function (not the raw one) + if (!opts_.raw_energy) + signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::epsilon())); + + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. 
+ srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, + 0, signal_frame->Dim() / 2 + 1); + + power_spectrum.ApplyFloor(std::numeric_limits::epsilon()); + power_spectrum.ApplyLog(); + + feature->CopyFromVec(power_spectrum); + + if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) + signal_log_energy = log_energy_floor_; + // The zeroth spectrogram component is always set to the signal energy, + // instead of the square of the constant component of the signal. + (*feature)(0) = signal_log_energy; } } // namespace kaldi diff --git a/src/feat/feature-spectrogram.h b/src/feat/feature-spectrogram.h index 500e3f4a588..ec318556f24 100644 --- a/src/feat/feature-spectrogram.h +++ b/src/feat/feature-spectrogram.h @@ -24,17 +24,17 @@ #include +#include "feat/feature-common.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -/// SpectrogramOptions contains basic options for computing SPECTROGRAM features -/// It only includes things that can be done in a "stateless" way, i.e. -/// it does not include energy max-normalization. -/// It does not include delta computation. +/// SpectrogramOptions contains basic options for computing spectrogram +/// features. struct SpectrogramOptions { FrameExtractionOptions frame_opts; BaseFloat energy_floor; @@ -53,26 +53,58 @@ struct SpectrogramOptions { } }; -/// Class for computing SPECTROGRAM features; see \ref feat_mfcc for more information. -class Spectrogram { +/// Class for computing spectrogram features. +class SpectrogramComputer { public: - explicit Spectrogram(const SpectrogramOptions &opts); - ~Spectrogram(); + typedef SpectrogramOptions Options; + explicit SpectrogramComputer(const SpectrogramOptions &opts); + SpectrogramComputer(const SpectrogramComputer &other); - /// Will throw exception on failure (e.g. if file too short for - /// even one frame). - void Compute(const VectorBase &wave, - Matrix *output, - Vector *wave_remainder = NULL); + const FrameExtractionOptions& GetFrameOptions() const { + return opts_.frame_opts; + } + + int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; } + + bool NeedRawLogEnergy() { return opts_.raw_energy; } + + + /** + Function that computes one frame of spectrogram features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp This is ignored by this function, it's only + needed for interface compatibility. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. 
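       Note (an illustrative calculation, assuming the default frame options):
       with 25 ms frames at 16 kHz the window is 400 samples, padded up to
       512, so Dim() returns 512 / 2 + 1 = 257.  Element 0 of the output is
       set to the frame log-energy rather than to the log of the DC bin of
       the power spectrum.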
+ */ + void Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + ~SpectrogramComputer(); private: SpectrogramOptions opts_; BaseFloat log_energy_floor_; - FeatureWindowFunction feature_window_function_; SplitRadixRealFft *srfft_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Spectrogram); + + // Disallow assignment. + SpectrogramComputer &operator=(const SpectrogramComputer &other); }; +typedef OfflineFeatureTpl Spectrogram; + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/feature-window.cc b/src/feat/feature-window.cc new file mode 100644 index 00000000000..f084f5c0170 --- /dev/null +++ b/src/feat/feature-window.cc @@ -0,0 +1,229 @@ +// feat/feature-window.cc + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation +// 2013-2016 Johns Hopkins University (author: Daniel Povey) +// 2014 IMSL, PKU-HKUST (author: Wei Shi) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "feat/feature-window.h" +#include "matrix/matrix-functions.h" + + +namespace kaldi { + + +int64 FirstSampleOfFrame(int32 frame, + const FrameExtractionOptions &opts) { + int64 frame_shift = opts.WindowShift(); + if (opts.snip_edges) { + return frame * frame_shift; + } else { + int64 midpoint_of_frame = frame_shift * frame + frame_shift / 2, + beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2; + return beginning_of_frame; + } +} + +int32 NumFrames(int64 num_samples, + const FrameExtractionOptions &opts, + bool flush) { + int64 frame_shift = opts.WindowShift(); + int64 frame_length = opts.WindowSize(); + if (opts.snip_edges) { + // with --snip-edges=true (the default), we use a HTK-like approach to + // determining the number of frames-- all frames have to fit completely into + // the waveform, and the first frame begins at sample zero. + if (num_samples < frame_length) + return 0; + else + return (1 + ((num_samples - frame_length) / frame_shift)); + // You can understand the expression above as follows: 'num_samples - + // frame_length' is how much room we have to shift the frame within the + // waveform; 'frame_shift' is how much we shift it each time; and the ratio + // is how many times we can shift it (integer arithmetic rounds down). + } else { + // if --snip-edges=false, the number of frames is determined by rounding the + // (file-length / frame-shift) to the nearest integer. The point of this + // formula is to make the number of frames an obvious and predictable + // function of the frame shift and signal length, which makes many + // segmentation-related questions simpler. + // + // Because integer division in C++ rounds toward zero, we add (half the + // frame-shift minus epsilon) before dividing, to have the effect of + // rounding towards the closest integer. 
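    // Worked example with assumed values: at 16 kHz with the default 25 ms
    // window and 10 ms shift, frame_length = 400 and frame_shift = 160
    // samples.  For num_samples = 1000, the snip-edges branch above gives
    // 1 + (1000 - 400) / 160 = 4 frames, while the formula below gives
    // (1000 + 80) / 160 = 6 frames (possibly reduced further below when
    // flush == false).  With --snip-edges=false, frame 0 then begins at
    // FirstSampleOfFrame(0) = 0*160 + 80 - 200 = -120, i.e. before the start
    // of the signal, which ExtractWindow() handles by reflection.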
+ int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift; + + if (flush) + return num_frames; + + // note: 'end' always means the last plus one, i.e. one past the last. + int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts) + + frame_length; + + // the following code is optimized more for clarity than efficiency. + // If flush == false, we can't output frames that extend past the end + // of the signal. + while (num_frames > 0 && end_sample_of_last_frame > num_samples) { + num_frames--; + end_sample_of_last_frame -= frame_shift; + } + return num_frames; + } +} + + +void Dither(VectorBase *waveform, BaseFloat dither_value) { + for (int32 i = 0; i < waveform->Dim(); i++) + (*waveform)(i) += RandGauss() * dither_value; +} + + +void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff) { + if (preemph_coeff == 0.0) return; + KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0); + for (int32 i = waveform->Dim()-1; i > 0; i--) + (*waveform)(i) -= preemph_coeff * (*waveform)(i-1); + (*waveform)(0) -= preemph_coeff * (*waveform)(0); +} + +FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) { + int32 frame_length = opts.WindowSize(); + KALDI_ASSERT(frame_length > 0); + window.Resize(frame_length); + for (int32 i = 0; i < frame_length; i++) { + BaseFloat i_fl = static_cast(i); + if (opts.window_type == "hanning") { + window(i) = 0.5 - 0.5*cos(M_2PI * i_fl / (frame_length-1)); + } else if (opts.window_type == "hamming") { + window(i) = 0.54 - 0.46*cos(M_2PI * i_fl / (frame_length-1)); + } else if (opts.window_type == "povey") { // like hamming but goes to zero at edges. + window(i) = pow(0.5 - 0.5*cos(M_2PI * i_fl / (frame_length-1)), 0.85); + } else if (opts.window_type == "rectangular") { + window(i) = 1.0; + } else { + KALDI_ERR << "Invalid window type " << opts.window_type; + } + } +} + +void ProcessWindow(const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + VectorBase *window, + BaseFloat *log_energy_pre_window) { + int32 frame_length = opts.WindowSize(); + KALDI_ASSERT(window->Dim() == frame_length); + + if (opts.dither != 0.0) + Dither(window, opts.dither); + + if (opts.remove_dc_offset) + window->Add(-window->Sum() / frame_length); + + if (log_energy_pre_window != NULL) { + BaseFloat energy = std::max(VecVec(*window, *window), + std::numeric_limits::epsilon()); + *log_energy_pre_window = Log(energy); + } + + if (opts.preemph_coeff != 0.0) + Preemphasize(window, opts.preemph_coeff); + + window->MulElements(window_function.window); +} + + +// ExtractWindow extracts a windowed frame of waveform with a power-of-two, +// padded size. It does mean subtraction, pre-emphasis and dithering as +// requested. 
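// A note on ordering, restating what ProcessWindow() above does: dithering
// and DC-offset removal are applied first, then the optional "raw"
// log-energy is taken, and only after that pre-emphasis and multiplication
// by the window function happen.  So the value written to
// log_energy_pre_window is unaffected by pre-emphasis and windowing.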
+void ExtractWindow(int64 sample_offset, + const VectorBase &wave, + int32 f, // with 0 <= f < NumFrames(feats, opts) + const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + Vector *window, + BaseFloat *log_energy_pre_window) { + KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0); + int32 frame_length = opts.WindowSize(), + frame_length_padded = opts.PaddedWindowSize(); + int64 num_samples = sample_offset + wave.Dim(), + start_sample = FirstSampleOfFrame(f, opts), + end_sample = start_sample + frame_length; + + if (opts.snip_edges) { + KALDI_ASSERT(start_sample >= sample_offset && + end_sample <= num_samples); + } else { + KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset); + } + + if (window->Dim() != frame_length_padded) + window->Resize(frame_length_padded, kUndefined); + + // wave_start and wave_end are start and end indexes into 'wave', for the + // piece of wave that we're trying to extract. + int32 wave_start = int32(start_sample - sample_offset), + wave_end = wave_start + frame_length; + if (wave_start >= 0 && wave_end <= wave.Dim()) { + // the normal case-- no edge effects to consider. + window->Range(0, frame_length).CopyFromVec( + wave.Range(wave_start, frame_length)); + } else { + // Deal with any end effects by reflection, if needed. This code will only + // be reached for about two frames per utterance, so we don't concern + // ourselves excessively with efficiency. + int32 wave_dim = wave.Dim(); + for (int32 s = 0; s < frame_length; s++) { + int32 s_in_wave = s + wave_start; + while (s_in_wave < 0 || s_in_wave >= wave_dim) { + // reflect around the beginning or end of the wave. + // e.g. -1 -> 0, -2 -> 1. + // dim -> dim - 1, dim + 1 -> dim - 2. + // the code supports repeated reflections, although this + // would only be needed in pathological cases. + if (s_in_wave < 0) s_in_wave = - s_in_wave - 1; + else s_in_wave = 2 * wave_dim - 1 - s_in_wave; + } + (*window)(s) = wave(s_in_wave); + } + } + + if (frame_length_padded > frame_length) + window->Range(frame_length, frame_length_padded - frame_length).SetZero(); + + SubVector frame(*window, 0, frame_length); + + ProcessWindow(opts, window_function, &frame, log_energy_pre_window); +} + +void ExtractWaveformRemainder(const VectorBase &wave, + const FrameExtractionOptions &opts, + Vector *wave_remainder) { + int32 frame_shift = opts.WindowShift(); + int32 num_frames = NumFrames(wave.Dim(), opts); + // offset is the amount at the start that has been extracted. + int32 offset = num_frames * frame_shift; + KALDI_ASSERT(wave_remainder != NULL); + int32 remaining_len = wave.Dim() - offset; + wave_remainder->Resize(remaining_len); + KALDI_ASSERT(remaining_len >= 0); + if (remaining_len > 0) + wave_remainder->CopyFromVec(SubVector(wave, offset, remaining_len)); +} + + +} // namespace kaldi diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h new file mode 100644 index 00000000000..748a8f91d10 --- /dev/null +++ b/src/feat/feature-window.h @@ -0,0 +1,207 @@ +// feat/feature-window.h + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University +// 2014-2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_WINDOW_H_ +#define KALDI_FEAT_FEATURE_WINDOW_H_ + +#include +#include + +#include "matrix/matrix-lib.h" +#include "util/common-utils.h" +#include "base/kaldi-error.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + +struct FrameExtractionOptions { + BaseFloat samp_freq; + BaseFloat frame_shift_ms; // in milliseconds. + BaseFloat frame_length_ms; // in milliseconds. + BaseFloat dither; // Amount of dithering, 0.0 means no dither. + BaseFloat preemph_coeff; // Preemphasis coefficient. + bool remove_dc_offset; // Subtract mean of wave before FFT. + std::string window_type; // e.g. Hamming window + bool round_to_power_of_two; + bool snip_edges; + // Maybe "hamming", "rectangular", "povey", "hanning" + // "povey" is a window I made to be similar to Hamming but to go to zero at the + // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) + // I just don't think the Hamming window makes sense as a windowing function. + FrameExtractionOptions(): + samp_freq(16000), + frame_shift_ms(10.0), + frame_length_ms(25.0), + dither(1.0), + preemph_coeff(0.97), + remove_dc_offset(true), + window_type("povey"), + round_to_power_of_two(true), + snip_edges(true){ } + + void Register(OptionsItf *opts) { + opts->Register("sample-frequency", &samp_freq, + "Waveform data sample frequency (must match the waveform file, " + "if specified there)"); + opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds"); + opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds"); + opts->Register("preemphasis-coefficient", &preemph_coeff, + "Coefficient for use in signal preemphasis"); + opts->Register("remove-dc-offset", &remove_dc_offset, + "Subtract mean from waveform on each frame"); + opts->Register("dither", &dither, "Dithering constant (0.0 means no dither)"); + opts->Register("window-type", &window_type, "Type of window " + "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\")"); + opts->Register("round-to-power-of-two", &round_to_power_of_two, + "If true, round window size to power of two."); + opts->Register("snip-edges", &snip_edges, + "If true, end effects will be handled by outputting only frames that " + "completely fit in the file, and the number of frames depends on the " + "frame-length. If false, the number of frames depends only on the " + "frame-shift, and we reflect the data at the ends."); + } + int32 WindowShift() const { + return static_cast(samp_freq * 0.001 * frame_shift_ms); + } + int32 WindowSize() const { + return static_cast(samp_freq * 0.001 * frame_length_ms); + } + int32 PaddedWindowSize() const { + return (round_to_power_of_two ? 
RoundUpToNearestPowerOfTwo(WindowSize()) : + WindowSize()); + } +}; + + +struct FeatureWindowFunction { + FeatureWindowFunction() {} + explicit FeatureWindowFunction(const FrameExtractionOptions &opts); + FeatureWindowFunction(const FeatureWindowFunction &other): + window(other.window) { } + Vector window; +}; + + +/** + This function returns the number of frames that we can extract from a wave + file with the given number of samples in it (assumed to have the same + sampling rate as specified in 'opts'). + + @param [in] wave_length The number of samples in the wave file. + @param [in] opts The frame-extraction options class + + @param [in] flush True if we are asserting that this number of samples is + 'all there is', false if we expecting more data to possibly come + in. This only makes a difference to the answer if opts.snips_edges + == false. For offline feature extraction you always want flush == + true. In an online-decoding context, once you know (or decide) that + no more data is coming in, you'd call it with flush == true at the + end to flush out any remaining data. +*/ +int32 NumFrames(int64 num_samples, + const FrameExtractionOptions &opts, + bool flush = true); + +/* + This function returns the index of the first sample of the frame indexed + 'frame'. If snip-edges=true, it just returns frame * opts.WindowShift(); if + snip-edges=false, the formula is a little more complicated and the result may + be negative. +*/ +int64 FirstSampleOfFrame(int32 frame, + const FrameExtractionOptions &opts); + + + +void Dither(VectorBase *waveform, BaseFloat dither_value); + +void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff); + +/** + This function does all the windowing steps after actually + extracting the windowed signal: depeding on the + configuration, it does dithering, dc offset removal, + preemphasis, and multiplication by the windowing function. + @param [in] opts The options class to be used + @param [in] window_function The windowing function-- should have + been initialized using 'opts'. + @param [in,out] window A vector of size opts.WindowSize(). Note: + it will typically be a sub-vector of a larger vector of size + opts.PaddedWindowSize(), with the remaining samples zero, + as the FFT code is more efficient if it operates on data with + power-of-two size. + @param [out] log_energy_pre_window If non-NULL, then after dithering and + DC offset removal, this function will write to this pointer the log of + the total energy (i.e. sum-squared) of the frame. + */ +void ProcessWindow(const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + VectorBase *window, + BaseFloat *log_energy_pre_window = NULL); + + +/* + ExtractWindow() extracts a windowed frame of waveform (possibly with a + power-of-two, padded size, depending on the config), including all the + proessing done by ProcessWindow(). + + @param [in] sample_offset If 'wave' is not the entire waveform, but + part of it to the left has been discarded, then the + number of samples prior to 'wave' that we have + already discarded. Set this to zero if you are + processing the entire waveform in one piece, or + if you get 'no matching function' compilation + errors when updating the code. + @param [in] wave The waveform + @param [in] f The frame index to be extracted, with + 0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true) + @param [in] opts The options class to be used + @param [in] window_function The windowing function, as derived from the + options class. 
+ @param [out] window The windowed, possibly-padded waveform to be + extracted. Will be resized as needed. + @param [out] log_energy_pre_window If non-NULL, the log-energy of + the signal prior to pre-emphasis and multiplying by + the windowing function will be written to here. +*/ +void ExtractWindow(int64 sample_offset, + const VectorBase &wave, + int32 f, + const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + Vector *window, + BaseFloat *log_energy_pre_window = NULL); + + +// ExtractWaveformRemainder is useful if the waveform is coming in segments. +// It extracts the bit of the waveform at the end of this block that you +// would have to append the next bit of waveform to, if you wanted to have +// the same effect as everything being in one big block. +void ExtractWaveformRemainder(const VectorBase &wave, + const FrameExtractionOptions &opts, + Vector *wave_remainder); + + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#endif // KALDI_FEAT_FEATURE_WINDOW_H_ diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index 9949a468d4c..714d963f01b 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -23,8 +23,9 @@ #include #include -#include "feat/mel-computations.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" namespace kaldi { @@ -57,7 +58,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, KALDI_ERR << "Bad values in options: low-freq " << low_freq << " and high-freq " << high_freq << " vs. nyquist " << nyquist; - + BaseFloat fft_bin_width = sample_freq / window_length_padded; // fft-bin width [think of it as Nyquist-freq / half-window-length] @@ -73,7 +74,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, BaseFloat vtln_low = opts.vtln_low, vtln_high = opts.vtln_high; if (vtln_high < 0.0) vtln_high += nyquist; - + if (vtln_warp_factor != 1.0 && (vtln_low < 0.0 || vtln_low <= low_freq || vtln_low >= high_freq @@ -106,7 +107,8 @@ MelBanks::MelBanks(const MelBanksOptions &opts, Vector this_bin(num_fft_bins); int32 first_index = -1, last_index = -1; for (int32 i = 0; i < num_fft_bins; i++) { - BaseFloat freq = (fft_bin_width * i); // center freq of this fft bin. + BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft + // bin. BaseFloat mel = MelScale(freq); if (mel > left_mel && mel < right_mel) { BaseFloat weight; @@ -122,7 +124,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, } KALDI_ASSERT(first_index != -1 && last_index >= first_index && "You may have set --num-mel-bins too large."); - + bins_[bin].first = first_index; int32 size = last_index + 1 - first_index; bins_[bin].second.Resize(size); @@ -131,7 +133,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, // Replicate a bug in HTK, for testing purposes. if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0) bins_[bin].second(0) = 0.0; - + } if (debug_) { for (size_t i = 0; i < bins_.size(); i++) { @@ -141,6 +143,12 @@ MelBanks::MelBanks(const MelBanksOptions &opts, } } +MelBanks::MelBanks(const MelBanks &other): + center_freqs_(other.center_freqs_), + bins_(other.bins_), + debug_(other.debug_), + htk_mode_(other.htk_mode_) { } + BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. 
BaseFloat vtln_high_cutoff, BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation @@ -218,19 +226,18 @@ BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower f // "power_spectrum" contains fft energies. void MelBanks::Compute(const VectorBase &power_spectrum, - Vector *mel_energies_out) const { + VectorBase *mel_energies_out) const { int32 num_bins = bins_.size(); - if (mel_energies_out->Dim() != num_bins) - mel_energies_out->Resize(num_bins); + KALDI_ASSERT(mel_energies_out->Dim() == num_bins); for (int32 i = 0; i < num_bins; i++) { int32 offset = bins_[i].first; const Vector &v(bins_[i].second); BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim())); // HTK-like flooring- for testing purposes (we prefer dither) - if (htk_mode_ && energy < 1.0) energy = 1.0; + if (htk_mode_ && energy < 1.0) energy = 1.0; (*mel_energies_out)(i) = energy; - + // The following assert was added due to a problem with OpenBlas that // we had at one point (it was a bug in that library). Just to detect // it early. @@ -303,5 +310,33 @@ void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) { } } +void GetEqualLoudnessVector(const MelBanks &mel_banks, + Vector *ans) { + int32 n = mel_banks.NumBins(); + // Central frequency of each mel bin. + const Vector &f0 = mel_banks.GetCenterFreqs(); + ans->Resize(n); + for (int32 i = 0; i < n; i++) { + BaseFloat fsq = f0(i) * f0(i); + BaseFloat fsub = fsq / (fsq + 1.6e5); + (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6)); + } +} + + +// Compute LP coefficients from autocorrelation coefficients. +BaseFloat ComputeLpc(const VectorBase &autocorr_in, + Vector *lpc_out) { + int32 n = autocorr_in.Dim() - 1; + KALDI_ASSERT(lpc_out->Dim() == n); + Vector tmp(n); + BaseFloat ans = Durbin(n, autocorr_in.Data(), + lpc_out->Data(), + tmp.Data()); + if (ans <= 0.0) + KALDI_WARN << "Zero energy in LPC computation"; + return -Log(1.0 / ans); // forms the C0 value +} + } // namespace kaldi diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index fbc9f532cd0..5df36c8cb90 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -1,6 +1,7 @@ // feat/mel-computations.h // Copyright 2009-2011 Phonexia s.r.o.; Microsoft Corporation +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -36,9 +37,43 @@ namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -struct FrameExtractionOptions; // defined in feature-function.h +struct FrameExtractionOptions; // defined in feature-window.h + + +struct MelBanksOptions { + int32 num_bins; // e.g. 25; number of triangular bins + BaseFloat low_freq; // e.g. 20; lower frequency cutoff + BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative + // ->added to the Nyquist frequency to get the cutoff. + BaseFloat vtln_low; // vtln lower cutoff of warping function. + BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added + // to the Nyquist frequency to get the cutoff. + bool debug_mel; + // htk_mode is a "hidden" config, it does not show up on command line. + // Enables more exact compatibibility with HTK, for testing purposes. Affects + // mel-energy flooring and reproduces a bug in HTK. 
+ bool htk_mode; + explicit MelBanksOptions(int num_bins = 25) + : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100), + vtln_high(-500), debug_mel(false), htk_mode(false) {} + + void Register(OptionsItf *opts) { + opts->Register("num-mel-bins", &num_bins, + "Number of triangular mel-frequency bins"); + opts->Register("low-freq", &low_freq, + "Low cutoff frequency for mel bins"); + opts->Register("high-freq", &high_freq, + "High cutoff frequency for mel bins (if < 0, offset from Nyquist)"); + opts->Register("vtln-low", &vtln_low, + "Low inflection point in piecewise linear VTLN warping function"); + opts->Register("vtln-high", &vtln_high, + "High inflection point in piecewise linear VTLN warping function" + " (if negative, offset from high-mel-freq"); + opts->Register("debug-mel", &debug_mel, + "Print out debugging information for mel bin computation"); + } +}; -struct MelBanksOptions; // defined in feature-function.h class MelBanks { public: @@ -74,14 +109,19 @@ class MelBanks { /// Compute Mel energies (note: not log enerties). /// At input, "fft_energies" contains the FFT energies (not log). void Compute(const VectorBase &fft_energies, - Vector *mel_energies_out) const; + VectorBase *mel_energies_out) const; int32 NumBins() const { return bins_.size(); } // returns vector of central freq of each bin; needed by plp code. const Vector &GetCenterFreqs() const { return center_freqs_; } + // Copy constructor + MelBanks(const MelBanks &other); private: + // Disallow assignment + MelBanks &operator = (const MelBanks &other); + // center frequencies of bins, numbered from 0 ... num_bins-1. // Needed by GetCenterFreqs(). Vector center_freqs_; @@ -92,7 +132,6 @@ class MelBanks { bool debug_; bool htk_mode_; - KALDI_DISALLOW_COPY_AND_ASSIGN(MelBanks); }; @@ -107,10 +146,21 @@ void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs); // pAC - autocorrelation coefficients [n + 1] // pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i] * s[n-i]}}) // F(z) = 1 / (1 - A(z)), 1 is not stored in the demoninator +// Returns log energy of residual (I think) BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp); +// Compute LP coefficients from autocorrelation coefficients. +// Returns log energy of residual (I think) +BaseFloat ComputeLpc(const VectorBase &autocorr_in, + Vector *lpc_out); + void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst); + + +void GetEqualLoudnessVector(const MelBanks &mel_banks, + Vector *ans); + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc index 7a46e837151..556160f8e53 100644 --- a/src/feat/online-feature-test.cc +++ b/src/feat/online-feature-test.cc @@ -172,6 +172,8 @@ void TestOnlineMfcc() { op.mel_opts.low_freq = 0.0; op.htk_compat = false; op.use_energy = false; // C0 not energy. + if (RandInt(0, 1) == 0) + op.frame_opts.snip_edges = false; Mfcc mfcc(op); // compute mfcc offline diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc index b9d74d3a293..267a4724580 100644 --- a/src/feat/online-feature.cc +++ b/src/feat/online-feature.cc @@ -24,82 +24,91 @@ namespace kaldi { - template void OnlineGenericBaseFeature::GetFrame(int32 frame, VectorBase *feat) { - KALDI_ASSERT(frame >= 0 && frame < num_frames_); - KALDI_ASSERT(feat->Dim() == Dim()); - feat->CopyFromVec(features_.Row(frame)); + // 'at' does size checking. 
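  // Any frame index below NumFramesReady() will already have been filled in
  // by ComputeFeatures(), so the pointer dereferenced here should never be
  // NULL.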
+ feat->CopyFromVec(*(features_.at(frame))); }; -template -bool OnlineGenericBaseFeature::IsLastFrame(int32 frame) const { - return (frame == num_frames_ - 1 && input_finished_); -} - template OnlineGenericBaseFeature::OnlineGenericBaseFeature( - const typename C::Options &opts) - :mfcc_or_plp_(opts), input_finished_(false), num_frames_(0), - sampling_frequency_(opts.frame_opts.samp_freq) { } + const typename C::Options &opts): + computer_(opts), window_function_(computer_.GetFrameOptions()), + input_finished_(false), waveform_offset_(0) { } template void OnlineGenericBaseFeature::AcceptWaveform(BaseFloat sampling_rate, - const VectorBase &waveform) { - if (waveform.Dim() == 0) { + const VectorBase &waveform) { + BaseFloat expected_sampling_rate = computer_.GetFrameOptions().samp_freq; + if (sampling_rate != expected_sampling_rate) + KALDI_ERR << "Sampling frequency mismatch, expected " + << expected_sampling_rate << ", got " << sampling_rate; + if (waveform.Dim() == 0) return; // Nothing to do. - } - if (input_finished_) { + if (input_finished_) KALDI_ERR << "AcceptWaveform called after InputFinished() was called."; - } - if (sampling_rate != sampling_frequency_) { - KALDI_ERR << "Sampling frequency mismatch, expected " - << sampling_frequency_ << ", got " << sampling_rate; - } - - Vector appended_wave; - - const VectorBase &wave_to_use = (waveform_remainder_.Dim() != 0 ? - appended_wave : waveform); - if (waveform_remainder_.Dim() != 0) { - appended_wave.Resize(waveform_remainder_.Dim() + - waveform.Dim()); + // append 'waveform' to 'waveform_remainder_.' + Vector appended_wave(waveform_remainder_.Dim() + waveform.Dim()); + if (waveform_remainder_.Dim() != 0) appended_wave.Range(0, waveform_remainder_.Dim()).CopyFromVec( waveform_remainder_); - appended_wave.Range(waveform_remainder_.Dim(), - waveform.Dim()).CopyFromVec(waveform); - } - waveform_remainder_.Resize(0); - - Matrix feats; - BaseFloat vtln_warp = 1.0; // We don't support VTLN warping in this wrapper. - mfcc_or_plp_.Compute(wave_to_use, vtln_warp, &feats, &waveform_remainder_); + appended_wave.Range(waveform_remainder_.Dim(), waveform.Dim()).CopyFromVec( + waveform); + waveform_remainder_.Swap(&appended_wave); + ComputeFeatures(); +} - if (feats.NumRows() == 0) { - // Presumably we got a very small waveform and could output no whole - // features. The waveform will have been appended to waveform_remainder_. - return; +template +void OnlineGenericBaseFeature::ComputeFeatures() { + const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions(); + int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim(); + int32 num_frames_old = features_.size(), + num_frames_new = NumFrames(num_samples_total, frame_opts, + input_finished_); + KALDI_ASSERT(num_frames_new >= num_frames_old); + features_.resize(num_frames_new, NULL); + + Vector window; + bool need_raw_log_energy = computer_.NeedRawLogEnergy(); + for (int32 frame = num_frames_old; frame < num_frames_new; frame++) { + BaseFloat raw_log_energy = 0.0; + ExtractWindow(waveform_offset_, waveform_remainder_, frame, + frame_opts, window_function_, &window, + need_raw_log_energy ? &raw_log_energy : NULL); + Vector *this_feature = new Vector(computer_.Dim(), + kUndefined); + // note: this online feature-extraction code does not support VTLN. 
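    // The vector allocated just above is owned by features_ and is freed by
    // DeletePointers(&features_) in the destructor (see online-feature.h).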
+ BaseFloat vtln_warp = 1.0; + computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature); + features_[frame] = this_feature; } - int32 new_num_frames = num_frames_ + feats.NumRows(); - BaseFloat increase_ratio = 1.5; // This is a tradeoff between memory and - // compute; it's the factor by which we - // increase the memory used each time. - if (new_num_frames > features_.NumRows()) { - int32 new_num_rows = std::max(new_num_frames, - features_.NumRows() * increase_ratio); - // Increase the size of the features_ matrix and copy over any existing - // data. - features_.Resize(new_num_rows, Dim(), kCopyData); + // OK, we will now discard any portion of the signal that will not be + // necessary to compute frames in the future. + int64 first_sample_of_next_frame = FirstSampleOfFrame(num_frames_new, + frame_opts); + int32 samples_to_discard = first_sample_of_next_frame - waveform_offset_; + if (samples_to_discard > 0) { + // discard the leftmost part of the waveform that we no longer need. + int32 new_num_samples = waveform_remainder_.Dim() - samples_to_discard; + if (new_num_samples <= 0) { + // odd, but we'll try to handle it. + waveform_offset_ += waveform_remainder_.Dim(); + waveform_remainder_.Resize(0); + } else { + Vector new_remainder(new_num_samples); + new_remainder.CopyFromVec(waveform_remainder_.Range(samples_to_discard, + new_num_samples)); + waveform_offset_ += samples_to_discard; + waveform_remainder_.Swap(&new_remainder); + } } - features_.Range(num_frames_, feats.NumRows(), 0, Dim()).CopyFromMat(feats); - num_frames_ = new_num_frames; } // instantiate the templates defined here for MFCC, PLP and filterbank classes. -template class OnlineGenericBaseFeature; -template class OnlineGenericBaseFeature; -template class OnlineGenericBaseFeature; +template class OnlineGenericBaseFeature; +template class OnlineGenericBaseFeature; +template class OnlineGenericBaseFeature; OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other): @@ -317,7 +326,7 @@ void OnlineCmvn::GetFrame(int32 frame, if (!skip_dims_.empty()) FakeStatsForSomeDims(skip_dims_, &stats); - + // call the function ApplyCmvn declared in ../transform/cmvn.h, which // requires a matrix. Matrix feat_mat(1, dim); @@ -486,8 +495,7 @@ void OnlineCacheFeature::GetFrame(int32 frame, VectorBase *feat) { void OnlineCacheFeature::ClearCache() { for (size_t i = 0; i < cache_.size(); i++) - if (cache_[i] != NULL) - delete cache_[i]; + delete cache_[i]; cache_.resize(0); } diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h index d0b4d54c256..ba87f696492 100644 --- a/src/feat/online-feature.h +++ b/src/feat/online-feature.h @@ -41,26 +41,34 @@ namespace kaldi { /// @{ - +/// This is a templated class for online feature extraction; +/// it's templated on a class like MfccComputer or PlpComputer +/// that does the basic feature extraction. template class OnlineGenericBaseFeature: public OnlineBaseFeature { public: // // First, functions that are present in the interface: // - virtual int32 Dim() const { return mfcc_or_plp_.Dim(); } - - // Note: this will only ever return true if you call InputFinished(), which - // isn't really necessary to do unless you want to make sure to flush out the - // last few frames of delta or LDA features to exactly match a non-online - // decode of some data. 
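  // A minimal usage sketch for the MFCC instantiation (OnlineMfcc, typedef'd
  // near the end of this file); 'opts', 'chunk' and GetNextChunk() are
  // hypothetical names and real application code will differ:
  //
  //   OnlineMfcc online_mfcc(opts);  // opts is an MfccOptions.
  //   int32 num_frames_read = 0;
  //   Vector<BaseFloat> chunk;
  //   while (GetNextChunk(&chunk)) {
  //     online_mfcc.AcceptWaveform(opts.frame_opts.samp_freq, chunk);
  //     for (; num_frames_read < online_mfcc.NumFramesReady();
  //          num_frames_read++) {
  //       Vector<BaseFloat> feat(online_mfcc.Dim());
  //       online_mfcc.GetFrame(num_frames_read, &feat);
  //       // ... use 'feat' ...
  //     }
  //   }
  //   online_mfcc.InputFinished();  // may flush out a last frame or two
  //                                 // when --snip-edges=false.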
- virtual bool IsLastFrame(int32 frame) const; - virtual int32 NumFramesReady() const { return num_frames_; } + virtual int32 Dim() const { return computer_.Dim(); } + + // Note: IsLastFrame() will only ever return true if you have called + // InputFinished() (and this frame is the last frame). + virtual bool IsLastFrame(int32 frame) const { + return input_finished_ && frame == NumFramesReady() - 1; + } + virtual BaseFloat FrameShiftInSeconds() const { + return computer_.GetFrameOptions().frame_shift_ms * 1.0e-03; + } + + virtual int32 NumFramesReady() const { return features_.size(); } + virtual void GetFrame(int32 frame, VectorBase *feat); - // // Next, functions that are not in the interface. - // + + + // Constructor from options class explicit OnlineGenericBaseFeature(const typename C::Options &opts); // This would be called from the application, when you get @@ -69,42 +77,58 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { // expected in the options. virtual void AcceptWaveform(BaseFloat sampling_rate, const VectorBase &waveform); - + // InputFinished() tells the class you won't be providing any - // more waveform. This will help flush out the last few frames - // of delta or LDA features. - virtual void InputFinished() { input_finished_= true; } + // more waveform. This will help flush out the last frame or two + // of features, in the case where snip-edges == false; it also + // affects the return value of IsLastFrame(). + virtual void InputFinished() { + input_finished_ = true; + ComputeFeatures(); + } + ~OnlineGenericBaseFeature() { + DeletePointers(&features_); + } private: - C mfcc_or_plp_; // class that does the MFCC or PLP computation + // This function computes any additional feature frames that it is possible to + // compute from 'waveform_remainder_', which at this point may contain more + // than just a remainder-sized quantity (because AcceptWaveform() appends to + // waveform_remainder_ before calling this function). It adds these feature + // frames to features_, and shifts off any now-unneeded samples of input from + // waveform_remainder_ while incrementing waveform_offset_ by the same amount. + void ComputeFeatures(); + + C computer_; // class that does the MFCC or PLP or filterbank computation + + FeatureWindowFunction window_function_; // features_ is the Mfcc or Plp or Fbank features that we have already computed. - Matrix features_; + + std::vector*> features_; // True if the user has called "InputFinished()" bool input_finished_; - // num_frames_ is the number of frames of MFCC features we have - // already computed. It may be less than the size of features_, - // because when we resize that matrix we leave some extra room, - // so that we don't spend too much time resizing. - int32 num_frames_; - // The sampling frequency, extracted from the config. Should // be identical to the waveform supplied. BaseFloat sampling_frequency_; + // waveform_offset_ is the number of samples of waveform that we have + // already discarded, i.e. thatn were prior to 'waveform_remainder_'. + int64 waveform_offset_; + // waveform_remainder_ is a short piece of waveform that we may need to keep // after extracting all the whole frames we can (whatever length of feature // will be required for the next phase of computation). 
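  // Roughly speaking, ComputeFeatures() maintains the invariant that
  // waveform_offset_ + waveform_remainder_.Dim() equals the total number of
  // samples accepted so far, and that waveform_offset_ never passes
  // FirstSampleOfFrame(features_.size(), frame_opts), so the frames not yet
  // computed can still be extracted from waveform_remainder_.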
Vector waveform_remainder_; }; -typedef OnlineGenericBaseFeature OnlineMfcc; -typedef OnlineGenericBaseFeature OnlinePlp; -typedef OnlineGenericBaseFeature OnlineFbank; +typedef OnlineGenericBaseFeature OnlineMfcc; +typedef OnlineGenericBaseFeature OnlinePlp; +typedef OnlineGenericBaseFeature OnlineFbank; /// This class takes a Matrix and wraps it as an @@ -119,8 +143,12 @@ class OnlineMatrixFeature: public OnlineFeatureInterface { virtual int32 Dim() const { return mat_.NumCols(); } + virtual BaseFloat FrameShiftInSeconds() const { + return 0.01f; + } + virtual int32 NumFramesReady() const { return mat_.NumRows(); } - + virtual void GetFrame(int32 frame, VectorBase *feat) { feat->CopyFromVec(mat_.Row(frame)); } @@ -129,6 +157,7 @@ class OnlineMatrixFeature: public OnlineFeatureInterface { return (frame + 1 == mat_.NumRows()); } + private: const MatrixBase &mat_; }; @@ -156,7 +185,7 @@ struct OnlineCmvnOptions { // buffer used for caching CMVN stats. std::string skip_dims; // Colon-separated list of dimensions to skip normalization // of, e.g. 13:14:15. - + OnlineCmvnOptions(): cmn_window(600), speaker_frames(600), @@ -166,7 +195,7 @@ struct OnlineCmvnOptions { modulus(20), ring_buffer_size(20), skip_dims("") { } - + void Check() { KALDI_ASSERT(speaker_frames <= cmn_window && global_frames <= speaker_frames && modulus > 0); @@ -225,11 +254,11 @@ struct OnlineCmvnState { global_cmvn_stats(global_stats) { } // Copy constructor - OnlineCmvnState(const OnlineCmvnState &other); + OnlineCmvnState(const OnlineCmvnState &other); void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); - + // Use the default assignment operator. }; @@ -242,7 +271,7 @@ struct OnlineCmvnState { We normally only do so in the "online" GMM-based decoding, e.g. in online2bin/online2-wav-gmm-latgen-faster.cc; see also the script steps/online/prepare_online_decoding.sh and steps/online/decode.sh. - + In the steady state (in the middle of a long utterance), this class accumulates CMVN statistics from the previous "cmn_window" frames (default 600 frames, or 6 seconds), and uses these to normalize the mean and possibly @@ -270,13 +299,15 @@ class OnlineCmvn: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } // The online cmvn does not introduce any additional latency. virtual int32 NumFramesReady() const { return src_->NumFramesReady(); } virtual void GetFrame(int32 frame, VectorBase *feat); - // // Next, functions that are not in the interface. 
// @@ -400,6 +431,9 @@ class OnlineSpliceFrames: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const; @@ -430,6 +464,9 @@ class OnlineTransform: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const { return src_->NumFramesReady(); } @@ -461,6 +498,9 @@ class OnlineDeltaFeature: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const; @@ -489,6 +529,9 @@ class OnlineCacheFeature: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const { return src_->NumFramesReady(); } @@ -520,6 +563,10 @@ class OnlineAppendFeature: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return (src1_->IsLastFrame(frame) || src2_->IsLastFrame(frame)); } + // Hopefully sources have the same rate + virtual BaseFloat FrameShiftInSeconds() const { + return src1_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const { return std::min(src1_->NumFramesReady(), src2_->NumFramesReady()); diff --git a/src/feat/pitch-functions.cc b/src/feat/pitch-functions.cc index 795fab3b2d4..12dd5030184 100644 --- a/src/feat/pitch-functions.cc +++ b/src/feat/pitch-functions.cc @@ -576,6 +576,8 @@ class OnlinePitchFeatureImpl { explicit OnlinePitchFeatureImpl(const PitchExtractionOptions &opts); int32 Dim() const { return 2; } + + BaseFloat FrameShiftInSeconds() const; int32 NumFramesReady() const; @@ -879,6 +881,10 @@ bool OnlinePitchFeatureImpl::IsLastFrame(int32 frame) const { return (input_finished_ && frame + 1 == T); } +BaseFloat OnlinePitchFeatureImpl::FrameShiftInSeconds() const { + return opts_.frame_shift_ms * 1.0e-03; +} + int32 OnlinePitchFeatureImpl::NumFramesReady() const { int32 num_frames = lag_nccf_.size(), latency = frames_latency_; @@ -1171,6 +1177,10 @@ bool OnlinePitchFeature::IsLastFrame(int32 frame) const { return impl_->IsLastFrame(frame); } +BaseFloat OnlinePitchFeature::FrameShiftInSeconds() const { + return impl_->FrameShiftInSeconds(); +} + void OnlinePitchFeature::GetFrame(int32 frame, VectorBase *feat) { impl_->GetFrame(frame, feat); } @@ -1335,8 +1345,6 @@ inline void AppendVector(const VectorBase &src, Vector *dst) { dst->Range(dst->Dim() - src.Dim(), src.Dim()).CopyFromVec(src); } -const int32 OnlineProcessPitch::kRawFeatureDim; - /** Note on the implementation of OnlineProcessPitch: the OnlineFeatureInterface allows random access to features (i.e. 
not necessarily diff --git a/src/feat/pitch-functions.h b/src/feat/pitch-functions.h index 52b3f815cde..fd9ead0c090 100644 --- a/src/feat/pitch-functions.h +++ b/src/feat/pitch-functions.h @@ -301,6 +301,8 @@ class OnlinePitchFeature: public OnlineBaseFeature { virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ } virtual int32 NumFramesReady() const; + + virtual BaseFloat FrameShiftInSeconds() const; virtual bool IsLastFrame(int32 frame) const; @@ -336,6 +338,9 @@ class OnlineProcessPitch: public OnlineFeatureInterface { else return src_->IsLastFrame(frame - opts_.delay); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const; @@ -348,7 +353,9 @@ class OnlineProcessPitch: public OnlineFeatureInterface { OnlineFeatureInterface *src); private: - static const int32 kRawFeatureDim = 2; // input: (nccf, pitch) + enum { kRawFeatureDim = 2}; // anonymous enum to define a constant. + // kRawFeatureDim defines the dimension + // of the input: (nccf, pitch) ProcessPitchOptions opts_; OnlineFeatureInterface *src_; diff --git a/src/feat/signal-test.cc b/src/feat/signal-test.cc new file mode 100644 index 00000000000..39a379040b0 --- /dev/null +++ b/src/feat/signal-test.cc @@ -0,0 +1,63 @@ +// feat/signal-test.cc + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
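Regarding the kRawFeatureDim change in pitch-functions.h above: the likely reason for switching from a static const int32 member to an anonymous enum is that, before C++17, an in-class-initialized static const integer still requires an out-of-class definition whenever it is odr-used (for example, passed by const reference to std::min); that definition is exactly the line const int32 OnlineProcessPitch::kRawFeatureDim; deleted from pitch-functions.cc above. An enumerator has no storage, so no separate definition is ever needed. A tiny sketch with a hypothetical struct Foo (not taken from the patch):

    struct Foo {
      static const int32 kDimStatic = 2;  // odr-use (e.g. std::min(x, Foo::kDimStatic))
                                          // needs "const int32 Foo::kDimStatic;" in a .cc file
      enum { kDimEnum = 2 };              // an enumerator never needs a separate definition
    };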
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/signal.h" + +namespace kaldi { + +void UnitTestBlockConvolution() { + for (int32 i = 0; i < 5; i++) { + int32 signal_length = 4000000 + Rand() % 400000; + int32 filter_length = 10000 + Rand() % 1000; + Vector signal(signal_length); + Vector filter(filter_length); + signal.SetRandn(); + filter.SetRandn(); + Vector signal_test(signal); + FFTbasedConvolveSignals(filter, &signal_test); + FFTbasedBlockConvolveSignals(filter, &signal); + AssertEqual(signal, signal_test, 0.000001 * signal.Dim()); + } +} + +void UnitTestConvolution() { + for (int32 i = 0; i < 5; i++) { + int32 signal_length = 40000 + Rand() % 4000; + int32 filter_length = 100 + Rand() % 100; + Vector signal(signal_length); + Vector filter(filter_length); + signal.SetRandn(); + filter.SetRandn(); + Vector signal_test(signal); + ConvolveSignals(filter, &signal_test); + FFTbasedBlockConvolveSignals(filter, &signal); + AssertEqual(signal, signal_test, 0.0001 * signal.Dim()); + } +} +} + +int main() { + using namespace kaldi; + UnitTestBlockConvolution(); + UnitTestConvolution(); + KALDI_LOG << "Tests succeeded."; + +} diff --git a/src/feat/signal.cc b/src/feat/signal.cc new file mode 100644 index 00000000000..e8fbb0b84cf --- /dev/null +++ b/src/feat/signal.cc @@ -0,0 +1,123 @@ +// feat/signal.cc + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
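The block (overlap-add) variant implemented below trades one huge FFT for many small ones: the filter is zero-padded once to fft_length = RoundUpToNearestPowerOfTwo(4 * filter_length), the signal is processed in blocks of block_length = fft_length - filter_length + 1 samples, and the trailing filter_length - 1 samples of each inverse FFT are carried over in temp_pad and added into the start of the next block. For the sizes exercised by the unit test above this gives, roughly (a sketch, assuming RoundUpToNearestPowerOfTwo() returns the next power of two at or above its argument):

    // Sizes the implementation below would choose for a ~10000-tap filter
    // (roughly the case exercised by UnitTestBlockConvolution() above):
    int32 filter_length = 10000;
    int32 fft_length    = RoundUpToNearestPowerOfTwo(4 * filter_length);  // 65536
    int32 block_length  = fft_length - filter_length + 1;                 // 55537
    // Each block costs one forward and one inverse FFT of size fft_length,
    // versus a single FFT over the whole padded signal (several million points
    // in that test) in FFTbasedConvolveSignals().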
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/signal.h" + +namespace kaldi { + +void ElementwiseProductOfFft(const Vector &a, Vector *b) { + int32 num_fft_bins = a.Dim() / 2; + for (int32 i = 0; i < num_fft_bins; i++) { + // do complex multiplication + ComplexMul(a(2*i), a(2*i + 1), &((*b)(2*i)), &((*b)(2*i + 1))); + } +} + +void ConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + Vector signal_padded(signal_length + filter_length - 1); + signal_padded.SetZero(); + for (int32 i = 0; i < signal_length; i++) { + for (int32 j = 0; j < filter_length; j++) { + signal_padded(i + j) += (*signal)(i) * filter(j); + } + } + signal->CopyFromVec(signal_padded.Range(0, signal_length)); +} + + +void FFTbasedConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + + int32 fft_length = RoundUpToNearestPowerOfTwo(signal_length + filter_length - 1); + KALDI_VLOG(1) << "fft_length for full signal convolution is " << fft_length; + + SplitRadixRealFft srfft(fft_length); + + Vector filter_padded(fft_length); + filter_padded.Range(0, filter_length).CopyFromVec(filter); + srfft.Compute(filter_padded.Data(), true); + + Vector signal_padded(fft_length); + signal_padded.Range(0, signal_length).CopyFromVec(*signal); + srfft.Compute(signal_padded.Data(), true); + + ElementwiseProductOfFft(filter_padded, &signal_padded); + + srfft.Compute(signal_padded.Data(), false); + signal_padded.Scale(1.0 / fft_length); + + signal->CopyFromVec(signal_padded.Range(0, signal_length)); +} + +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + + KALDI_VLOG(1) << "Length of the filter is " << filter_length; + + int32 fft_length = RoundUpToNearestPowerOfTwo(4 * filter_length); + KALDI_VLOG(1) << "Best FFT length is " << fft_length; + + int32 block_length = fft_length - filter_length + 1; + KALDI_VLOG(1) << "Block size is " << block_length; + SplitRadixRealFft srfft(fft_length); + + Vector filter_padded(fft_length); + filter_padded.Range(0, filter_length).CopyFromVec(filter); + srfft.Compute(filter_padded.Data(), true); + + Vector temp_pad(filter_length - 1); + temp_pad.SetZero(); + Vector signal_block_padded(fft_length); + + for (int32 po = 0; po < signal_length; po += block_length) { + // get a block of the signal + int32 process_length = std::min(block_length, signal_length - po); + signal_block_padded.SetZero(); + signal_block_padded.Range(0, process_length).CopyFromVec(signal->Range(po, process_length)); + + srfft.Compute(signal_block_padded.Data(), true); + + ElementwiseProductOfFft(filter_padded, &signal_block_padded); + + srfft.Compute(signal_block_padded.Data(), false); + signal_block_padded.Scale(1.0 / fft_length); + + // combine the block + if (po + block_length < signal_length) { // current block is not the last block + signal->Range(po, block_length).CopyFromVec(signal_block_padded.Range(0, block_length)); + signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); + temp_pad.CopyFromVec(signal_block_padded.Range(block_length, filter_length - 1)); + } else { + signal->Range(po, signal_length - po).CopyFromVec( + signal_block_padded.Range(0, signal_length - po)); + if (filter_length - 1 < signal_length - po) + signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); + else + signal->Range(po, signal_length - po).AddVec(1.0, 
temp_pad.Range(0, signal_length - po)); + } + } +} +} + diff --git a/src/feat/signal.h b/src/feat/signal.h new file mode 100644 index 00000000000..7ff0ce33b52 --- /dev/null +++ b/src/feat/signal.h @@ -0,0 +1,51 @@ +// feat/signal.h + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_SIGNAL_H_ +#define KALDI_FEAT_SIGNAL_H_ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" + +namespace kaldi { + +/* + This function implements a simple non-FFT-based convolution of two signals. + It is suggested to use the FFT-based convolution function which is more + efficient. +*/ +void ConvolveSignals(const Vector &filter, Vector *signal); + +/* + This function implements FFT-based convolution of two signals. + However this should be an inefficient version of BlockConvolveSignals() + as it processes the entire signal with a single FFT. +*/ +void FFTbasedConvolveSignals(const Vector &filter, Vector *signal); + +/* + This function implements FFT-based block convolution of two signals using + overlap-add method. This is an efficient way to evaluate the discrete + convolution of a long signal with a finite impulse response filter. +*/ +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal); + +} // namespace kaldi + +#endif // KALDI_FEAT_SIGNAL_H_ diff --git a/src/feat/sinusoid-detection.cc b/src/feat/sinusoid-detection.cc index 187b94953ac..bf6b0b9e4fe 100644 --- a/src/feat/sinusoid-detection.cc +++ b/src/feat/sinusoid-detection.cc @@ -104,7 +104,7 @@ void SinusoidDetector::QuadraticMaximize( // Also, x1.y2 - y1 = a (x1 - x1^2) + (x1 - 1) c, so // a = ( (x1 y2 - y1) - (x1 - 1) c) / (x1 - x1^2), and // b = y2 - a - c. - BaseFloat c = y0, + BaseFloat c = y0, a = (x1 * y2 - y1 - (x1 - 1.0) * c) / (x1 - x1*x1), b = y2 - a - c; @@ -152,8 +152,8 @@ BaseFloat SinusoidDetector::QuadraticInterpolate( KALDI_ASSERT(x1 >= 0.0 && x1 <= 1.0); if (x1 == 0.0) return y0; else if (x1 == 1.0) return y2; - - BaseFloat c = y0, + + BaseFloat c = y0, a = (x1 * y2 - y1 - (x1 - 1.0) * c) / (x1 - x1*x1), b = y2 - a - c; return a * x * x + b * x + c; @@ -172,7 +172,7 @@ void SinusoidDetector::CreateCosAndSin(BaseFloat samp_freq, BaseFloat *cos_data = cos_vec->Data(), *sin_data = sin_vec->Data(); BaseFloat factor_real = cos(M_2PI * freq / samp_freq), factor_im = sin(M_2PI * freq / samp_freq); - + // process frames in batches of size "batch_size", after which we recompute // the starting point to prevent loss of accuracy due to drift. 
for (int32 b = 0; b * batch_size < dim; b++) { @@ -191,7 +191,7 @@ void SinusoidDetector::CreateCosAndSin(BaseFloat samp_freq, } SinusoidDetector::SinusoidDetector(BaseFloat samp_freq, - int32 num_samp): + int32 num_samp): samp_freq_(samp_freq), num_samples_(num_samp), num_samples_padded_(RoundUpToNearestPowerOfTwo(num_samp)), @@ -208,14 +208,14 @@ void SinusoidDetector::SelfTest( BaseFloat final_energy) { int32 num_bins = num_samples_padded_ * 2 + 1; - + { BaseFloat cutoff = 0.0; for (int32 k = 0; k <= num_bins; k += 4) cutoff = std::max(cutoff, info[k].energy); BaseFloat energy_upper_bound = factor1_ * cutoff; if (final_energy > energy_upper_bound) { - KALDI_WARN << "Self-testing failed [factor1]: " + KALDI_WARN << "Self-testing failed [factor1]: " << final_energy << " > " << energy_upper_bound << ", num-samples is " << num_samples_ << ", freq/nyquist = " @@ -231,17 +231,17 @@ void SinusoidDetector::SelfTest( cutoff = std::max(cutoff, info[k].energy); BaseFloat energy_upper_bound = factor2_ * cutoff; if (final_energy > energy_upper_bound) { - KALDI_WARN << "Self-testing failed [factor2]: " + KALDI_WARN << "Self-testing failed [factor2]: " << final_energy << " > " << energy_upper_bound << ", num-samples is " << num_samples_ << ", freq/nyquist = " << (final_freq / (samp_freq_ * 0.5)) << "- would require factor2 >= " << (final_energy / cutoff); - + } } - + } @@ -249,7 +249,7 @@ BaseFloat SinusoidDetector::OptimizeFrequency( const std::vector &info, int32 *bin_out, BaseFloat *offset_out) const { - + BaseFloat max_energy = 0.0; *bin_out = -1; int32 max_freq = num_samples_padded_ * 2; @@ -320,20 +320,20 @@ BaseFloat SinusoidDetector::DetectSinusoid( // between bins, with an offset. int32 bin; BaseFloat offset; - + BaseFloat opt_energy = OptimizeFrequency(info, &bin, &offset); if (opt_energy == 0.0) return 0.0; BaseFloat max_freq = (bin + offset) * samp_freq_ / (num_samples_padded_ * 4); - + KALDI_VLOG(4) << "Best frequency based on interpolation is " << max_freq << ", best energy is " << opt_energy << ", bin is " << bin; OptimizedInfo final_info; - + FineOptimizeFrequency(signal, bin, offset, &info, &final_info); // the following while loop will rarely be accessed. @@ -342,7 +342,7 @@ BaseFloat SinusoidDetector::DetectSinusoid( FineOptimizeFrequency(signal, bin, 1.0, &info, &final_info); } - // the following while loop will rarely be accessed. + // the following while loop will rarely be accessed. while (final_info.offset == 1.0 && bin < num_samples_padded_ * 2) { bin++; FineOptimizeFrequency(signal, bin, 0.0, &info, &final_info); @@ -353,9 +353,9 @@ BaseFloat SinusoidDetector::DetectSinusoid( // next-to-highest allowed bin (note, "bin" here is a range, and it can // never have the value num_samples_padded_ * 2), we tend to get more // estimation error than usual, so do another round of optimization. - FineOptimizeFrequency(signal, bin, final_info.offset, &info, &final_info); + FineOptimizeFrequency(signal, bin, final_info.offset, &info, &final_info); } - + BaseFloat final_freq = (final_info.bin + final_info.offset) * samp_freq_ / (num_samples_padded_ * 4); KALDI_VLOG(4) << "Final optimized info is: freq " << final_freq << ", cos coeff " << final_info.cos_coeff << ", sin coeff " @@ -390,12 +390,12 @@ BaseFloat SinusoidDetector::DetectSinusoid( Let the signal, as a vector, be V. We want to maximize the (positive) energy-difference: - ||V||^2 - || V - c C_f - s S_f ||^2 + ||V||^2 - || V - c C_f - s S_f ||^2 where c and s are the coefficients of C_f and S_f. 
This quantity can be expanded as follows, where . means dot product. \delta E = -c^2 C_f.C_f - s^2 S_f.S_f - 2 c s C_f.S_f + 2 c V.C_f + 2 s V.S_f. which can be written as follows, where . means dot-product and ' means transpose: - \delta E = 2 [c s] v - [c s] M [c s]' + \delta E = 2 [c s] v - [c s] M [c s]' where M = [ C_f.C_f, C_f.S_f, C_f.S_f, S_f.S_f ], and v = [V.C_f, V.S_f]. If M is invertible (i.e. for nonzero frequencies), this is maximized by @@ -451,7 +451,7 @@ void SinusoidDetector::ComputeCoefficients() { int32 num_freq = num_samples_padded_ * 2 + 1; cos_.Resize(num_freq, num_samp); sin_.Resize(num_freq, num_samp); - + Vector cc(num_freq), cs(num_freq); for (int32 k = 0; k < num_freq; k++) { BaseFloat freq = k * samp_freq_ / (num_samples_padded_ * 4); @@ -460,10 +460,10 @@ void SinusoidDetector::ComputeCoefficients() { cc(k) = VecVec(c, c); cs(k) = VecVec(c, s); } - - M_.Resize(num_freq, 3, kUndefined); + + M_.Resize(num_freq, 3, kUndefined); Minv_.Resize(num_freq, 3, kUndefined); - + for (int32 k = 0; k < num_freq; k++) { // Let the matrix M be [ a b; b d ]. [we don't write c because c == b]. // We want to compute Minv_. @@ -503,7 +503,7 @@ void SinusoidDetector::FineOptimizeFrequency( std::vector &info = *info_in; if (!info[bin].valid) ComputeBinInfo(signal, bin, &(info[bin])); if (!info[bin+1].valid) ComputeBinInfo(signal, bin+1, &(info[bin+1])); - + const BaseFloat epsilon = 0.02, delta = 0.001; // If the offset is very close to the edges of the bin, move it @@ -527,16 +527,16 @@ void SinusoidDetector::FineOptimizeFrequency( BaseFloat a = VecVec(c, c), b = VecVec(c, s), d = num_samples_ - a; BaseFloat inv_det = 1.0 / (a * d - b * b); BaseFloat inv_a = d * inv_det, inv_b = -b * inv_det, inv_d = a * inv_det; - + BaseFloat v1 = VecVec(c, signal), v2 = VecVec(s, signal); - + BaseFloat delta_e = v1 * v1 * inv_a + v2 * v2 * inv_d + 2 * v1 * v2 * inv_b; - + KALDI_VLOG(4) << "Actual energy-change at frequency " << freq << " is " << delta_e; // "freq" is frequency somewhere in the middle of the bin. - + BaseFloat final_offset, final_energy; QuadraticMaximize(bin_offset, info[bin].energy, delta_e, info[bin+1].energy, &final_offset, &final_energy); @@ -561,7 +561,7 @@ void SinusoidDetector::FineOptimizeFrequency( // Now get the inverse of the M matrix at the final point. BaseFloat a_inv_interp, b_inv_interp, d_inv_interp; - + if ((bin == 0 && final_offset < delta) || (bin == num_samples_padded_ * 2 && final_offset > 1.0 - delta)) { // If we're extremely close to zero or the Nyquist, we'll have trouble @@ -584,7 +584,7 @@ void SinusoidDetector::FineOptimizeFrequency( info[bin+1].cos_dot, final_offset); BaseFloat v2_interp = QuadraticInterpolate(bin_offset, info[bin].sin_dot, v2, info[bin+1].sin_dot, final_offset); - + opt_info->bin = bin; opt_info->offset = final_offset; // Recompute the energy-reduction using the more accurate interpolated values of @@ -596,7 +596,7 @@ void SinusoidDetector::FineOptimizeFrequency( // Compute the coefficients of the cos and sin in the optimal sinusoid, as // M^{-1} v. 
opt_info->cos_coeff = a_inv_interp * v1_interp + b_inv_interp * v2_interp; - opt_info->sin_coeff = b_inv_interp * v1_interp + d_inv_interp * v2_interp; + opt_info->sin_coeff = b_inv_interp * v1_interp + d_inv_interp * v2_interp; } void SinusoidDetector::FindCandidateBins( @@ -611,7 +611,7 @@ void SinusoidDetector::FindCandidateBins( KALDI_ASSERT(info[k].valid); cutoff = std::max(cutoff, info[k].energy); } - + for (int32 k = 0; k < max_bin; k += 4) { BaseFloat energy_upper_bound = factor1_ * std::max(info[k].energy, @@ -628,14 +628,14 @@ void SinusoidDetector::FindCandidateBins2( std::vector *bins2) const { int32 max_bin = num_samples_padded_ * 2; - + BaseFloat cutoff = min_energy; for (int32 k = 0; k <= max_bin; k += 2) { if (info[k].valid) cutoff = std::max(cutoff, info[k].energy); } - for (int32 k = 0; k < max_bin; k += 2) { + for (int32 k = 0; k < max_bin; k += 2) { if (info[k].valid && info[k+2].valid) { BaseFloat energy_upper_bound = factor2_ * std::max(info[k].energy, @@ -645,7 +645,7 @@ void SinusoidDetector::FindCandidateBins2( } } } - + void SinusoidDetector::ComputeBinInfo( const VectorBase &signal, @@ -670,8 +670,6 @@ MultiSinusoidDetector::MultiSinusoidDetector( sample_freq_(sampling_freq), samples_per_frame_subsampled_(0.001 * config.frame_length_ms * static_cast(config.subsample_freq)), - samples_shift_subsampled_(0.001 * config.frame_shift_ms * - static_cast(config.subsample_freq)), waveform_finished_(false), samples_consumed_(0), resampler_(sampling_freq, config.subsample_freq, @@ -726,7 +724,7 @@ int32 MultiSinusoidDetector::NumSubsampledSamplesReady(int32 max_samp) const { ((subsampled_signal_.empty() && samples_consumed_ == 0) || (!subsampled_signal_.empty () && samples_consumed_ < subsampled_signal_[0]->Dim()))); - + int32 ans = -samples_consumed_; for (size_t i = 0; i < subsampled_signal_.size(); i++) { ans += subsampled_signal_[i]->Dim(); @@ -787,7 +785,7 @@ void MultiSinusoidDetector::GetNextFrame(MultiSinusoidDetectorOutput *output) { if (signal_energy == 0.0) return; // min_energy1 is the lowest energy we might care about. - BaseFloat min_energy1 = signal_energy * + BaseFloat min_energy1 = signal_energy * std::min(config_.two_freq_min_total_energy * 0.5, config_.one_freq_min_energy); @@ -830,7 +828,7 @@ void MultiSinusoidDetector::GetNextFrame(MultiSinusoidDetectorOutput *output) { << factor << ". 
(This means sinusoid detection is not " << " working ideally)."; } - + if (DetectedTwoFrequency(signal_energy, sinusoid1, energy1, sinusoid2, energy2, @@ -917,14 +915,14 @@ void DetectSinusoids(const VectorBase &signal, detector->AcceptWaveform(signal); detector->WaveformFinished(); - int32 safety_margin = 10, approx_num_frames = safety_margin + + int32 safety_margin = 10, approx_num_frames = safety_margin + (signal.Dim() / (detector->SamplingFrequency() * detector->FrameShiftSecs())); output_vec.reserve(approx_num_frames); while (!detector->Done()) { output_vec.resize(output_vec.size() + 1); detector->GetNextFrame(&(output_vec.back())); - } + } detector->Reset(); if (output_vec.empty()) { output->Resize(0, 0); diff --git a/src/feat/sinusoid-detection.h b/src/feat/sinusoid-detection.h index 29483fcc30b..f6addc0b530 100644 --- a/src/feat/sinusoid-detection.h +++ b/src/feat/sinusoid-detection.h @@ -55,7 +55,7 @@ class SinusoidDetector { public: SinusoidDetector(BaseFloat samp_freq, int32 num_samp); - + // Detect the dominant sinusoid component in the signal, as long as the // energy-reduction of the signal from subtracting that sinuoid would be >= @@ -65,7 +65,7 @@ class SinusoidDetector { BaseFloat DetectSinusoid(BaseFloat min_energy_change, const VectorBase &signal, Sinusoid *sinusoid); - + // This function does quadratic interpolation for a function that is known at // three equally spaced points [x0 x1 x2] = [0 1 2], and we want the x-value // and corresponding y-value at the maximum of the function within the range @@ -89,7 +89,7 @@ class SinusoidDetector { static BaseFloat QuadraticInterpolate( BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2, BaseFloat x); - + private: BaseFloat samp_freq_; @@ -121,14 +121,14 @@ class SinusoidDetector { // containing the values x y z of a symmetric matrix [ a b; b c ]. There is // one of these matrices for each frequency, sampled at one quarter the // spacing of the FFT bins. There is a long comment next to the definition of - // ComputeCoefficients that describes this. + // ComputeCoefficients that describes this. Matrix M_; // Minv_ is the coefficients in the same format as M_, but containing the // corresponding coefficients of the inverse matrix. There is a long comment // next to the definition of ComputeCoefficients that describes this. Matrix Minv_; - + struct InfoForBin { bool valid; @@ -146,9 +146,9 @@ class SinusoidDetector { BaseFloat cos_coeff; BaseFloat sin_coeff; }; - + // Compute the coefficients and energies at the original FFT bins (every - // fourth entry in "info"). + // fourth entry in "info"). void ComputeCoarseInfo(const Vector &fft, std::vector *info) const; @@ -164,11 +164,11 @@ class SinusoidDetector { const std::vector &info, std::vector *bins) const; - + void ComputeBinInfo(const VectorBase &signal, int32 bin, InfoForBin *info) const; - + // For each bin b such that we have valid "info" data for bins b, b+1 and b+2, // does quadratic interpolation to find the maximum predicted energy in the // range [b, b+2]. The location of the maximum predicted energy is output to @@ -186,7 +186,7 @@ class SinusoidDetector { const std::vector &info, int32 *bin_out, BaseFloat *offset_out) const; - + // This function does // (*cos)(t) = cos(2 pi t freq / samp_freq) @@ -195,7 +195,7 @@ class SinusoidDetector { BaseFloat freq, VectorBase *cos, VectorBase *sin); - + // Do fine optimization of the frequency within a bin, given a reasonable // approximate position within it based on interpolation (that should be close // to the optimum). 
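For reference, the quadratic maximization described in the long comment above has the standard closed form: writing w = [c s]', the energy reduction

    \Delta E(w) = 2 w' v - w' M w

is maximized at w* = M^{-1} v, with maximum value

    \Delta E_max = v' M^{-1} v,

which is exactly what FineOptimizeFrequency() evaluates as v1*v1*inv_a + v2*v2*inv_d + 2*v1*v2*inv_b from the cached entries (inv_a, inv_b, inv_d) of M^{-1}; the cos and sin coefficients of the detected sinusoid are then the two components of M^{-1} v, as computed at the end of that function.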
@@ -205,7 +205,7 @@ class SinusoidDetector { BaseFloat offset, std::vector *info, OptimizedInfo *opt_info) const; - + // Computes the coefficients cos_, sin_, and Minv_. void ComputeCoefficients(); @@ -263,7 +263,7 @@ struct MultiSinusoidDetectorConfig { // the following is not critical and is not exported to the // command line. int32 subsample_filter_zeros; - + MultiSinusoidDetectorConfig(): frame_length_ms(20), frame_shift_ms(10), two_freq_min_energy(0.2), two_freq_min_total_energy(0.6), @@ -313,8 +313,8 @@ struct MultiSinusoidDetectorConfig { KALDI_ASSERT(fabs(samples_per_frame_shift - static_cast(samples_per_frame_shift)) < 0.001); - - } + + } }; struct MultiSinusoidDetectorOutput { @@ -338,19 +338,19 @@ class MultiSinusoidDetector { // Initialize sinusoid detector. Sampling frequency must be integer. MultiSinusoidDetector(const MultiSinusoidDetectorConfig &config, - int32 sampling_freq); + int32 sampling_freq); /// This is how the class acccepts its input. You can put the waveform in /// piece by piece, if it's an online application. void AcceptWaveform(const VectorBase &waveform); - + /// The user calls this to announce to the class that the waveform has ended; /// this forces any pending data to be flushed. void WaveformFinished(); /// Resets the state of the class so you can start processing another waveform. - void Reset(); - + void Reset(); + /// This returns true if the class currently has no more data ready to output. bool Done() const; @@ -362,7 +362,7 @@ class MultiSinusoidDetector { BaseFloat FrameShiftSecs() const { return 0.001 * config_.frame_shift_ms; } BaseFloat SamplingFrequency() const { return sample_freq_; } - + private: // Gets the next frame of subsampled signal, and consumes the appropriate // amount of stored data. It is an error to call this if Done() returned @@ -386,23 +386,21 @@ class MultiSinusoidDetector { const Sinusoid &sinusoid2, BaseFloat energy2, MultiSinusoidDetectorOutput *output); - - + + // Returns std::min(max_samp, sum-of-samples-in-subsampled_signal_). // (the std::min is for efficiency so we don't have to visit the // whole list). int32 NumSubsampledSamplesReady(int32 max_samp) const; - + MultiSinusoidDetectorConfig config_; int32 sample_freq_; int32 samples_per_frame_subsampled_; // (samples per frame at subsampled // rate). - int32 samples_shift_subsampled_; // (samples per frame-shift at subsampled - // rate). // True if the user has called WaveformFinished(). bool waveform_finished_; - + // Pieces of the subsampled signal that are awaiting processing. // Normally there will be just one element here, but if someone calls // AcceptWaveform multiple times before getting output, there could @@ -414,12 +412,12 @@ class MultiSinusoidDetector { // (subsampled_signal_.empty() && samples_consumed_ == 0) or // samples_consumed_ < subsampled_signal_[0]->Dim(). int32 samples_consumed_; - - + + // This object is used to subsample the signal. LinearResample resampler_; - // This object is used to detect sinusoids in the subsampled + // This object is used to detect sinusoids in the subsampled // frames. SinusoidDetector detector_; }; diff --git a/src/feat/wave-reader.cc b/src/feat/wave-reader.cc index cb3f287fdd6..389b461d86c 100644 --- a/src/feat/wave-reader.cc +++ b/src/feat/wave-reader.cc @@ -106,7 +106,7 @@ void WaveData::WriteUint16(std::ostream &os, int16 i) { -void WaveData::Read(std::istream &is) { +void WaveData::Read(std::istream &is, ReadDataType read_data) { data_.Resize(0, 0); // clear the data. 
char tmp[5]; @@ -224,13 +224,26 @@ void WaveData::Read(std::istream &is) { if (std::abs(static_cast(riff_chunk_read) + static_cast(data_chunk_size) - static_cast(riff_chunk_size)) > 1) { - // we allow the size to be off by one, because there is a weirdness in the - // format of RIFF files that means that the input may sometimes be padded - // with 1 unused byte to make the total size even. - KALDI_ERR << "Expected " << riff_chunk_size << " bytes in RIFF chunk, but " - << "after first data block there will be " << riff_chunk_read - << " + " << data_chunk_size << " bytes " - << "(we do not support reading multiple data chunks)."; + // we allow the size to be off by one without warning, because there is a + // weirdness in the format of RIFF files that means that the input may + // sometimes be padded with 1 unused byte to make the total size even. + KALDI_WARN << "Expected " << riff_chunk_size << " bytes in RIFF chunk, but " + << "after first data block there will be " << riff_chunk_read + << " + " << data_chunk_size << " bytes " + << "(we do not support reading multiple data chunks)."; + } + + if (read_data == kLeaveDataUndefined) { + // we won't actually be reading the data- we'll just be faking that we read + // that data, so the caller can get the metadata. + // assume we'd read the same number of bytes that the data-chunk header + // says we'd read. + int32 num_bytes_read = data_chunk_size; + uint32 num_samp = num_bytes_read / block_align; + data_.Resize(num_channels, num_samp, kUndefined); + return; + } else { + KALDI_ASSERT(read_data == kReadData); } std::vector data_pointer_vec; diff --git a/src/feat/wave-reader.h b/src/feat/wave-reader.h index 64e7bd94d4e..0749022f7d7 100644 --- a/src/feat/wave-reader.h +++ b/src/feat/wave-reader.h @@ -37,6 +37,11 @@ // each WAVE chunk has header sub-chunk 'fmt_' // and one or more data sub-chunks 'data' // +// [Note from Dan: to say that the wave format was ever "specified" anywhere is +// not quite right. The guy who invented the wave format attempted to create +// a formal specification but it did not completely make sense. And there +// doesn't seem to be a consensus on what makes a valid wave file, +// particularly where the accuracy of header information is concerned.] */ @@ -59,6 +64,8 @@ const BaseFloat kWaveSampleMax = 32768.0; /// This class's purpose is to read in Wave files. class WaveData { public: + enum ReadDataType { kReadData, kLeaveDataUndefined }; + WaveData(BaseFloat samp_freq, const MatrixBase &data) : data_(data), samp_freq_(samp_freq) {} @@ -67,7 +74,7 @@ class WaveData { /// Read() will throw on error. It's valid to call Read() more than once-- /// in this case it will destroy what was there before. /// "is" should be opened in binary mode. - void Read(std::istream &is); + void Read(std::istream &is, ReadDataType read_data = kReadData); /// Write() will throw on error. os should be opened in binary mode. void Write(std::ostream &os) const; @@ -92,6 +99,11 @@ class WaveData { samp_freq_ = 0.0; } + void Swap(WaveData *other) { + data_.Swap(&(other->data_)); + std::swap(samp_freq_, other->samp_freq_); + } + private: static const uint32 kBlockSize = 1024 * 1024; // Use 1M bytes. Matrix data_; @@ -106,8 +118,11 @@ class WaveData { }; -// Holder class for .wav files that enables us to read (but not write) -// .wav files. c.f. util/kaldi-holder.h +// Holder class for .wav files that enables us to read (but not write) .wav +// files. c.f. 
util/kaldi-holder.h we don't use the KaldiObjectHolder template +// because we don't want to check for the \0B binary header. We could have faked +// it by pretending to read in the wave data in text mode after failing to find +// the \0B header, but that would have been a little ugly. class WaveHolder { public: typedef WaveData T; @@ -120,8 +135,8 @@ class WaveHolder { t.Write(os); // throws exception on failure. return true; } catch (const std::exception &e) { - KALDI_WARN << "Exception caught in WaveHolder object (writing)."; - if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + KALDI_WARN << "Exception caught in WaveHolder object (writing). " + << e.what(); return false; // write failure. } } @@ -147,12 +162,71 @@ class WaveHolder { t_.Read(is); // throws exception on failure. return true; } catch (const std::exception &e) { - KALDI_WARN << "Exception caught in WaveHolder object (reading)."; - if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + KALDI_WARN << "Exception caught in WaveHolder object (reading). " + << e.what(); return false; // write failure. } } + void Swap(WaveHolder *other) { + t_.Swap(&(other->t_)); + } + + bool ExtractRange(const WaveHolder &other, const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + private: + T t_; +}; + +// This is like WaveHolder but when you just want the metadata- +// it leaves the actual data undefined, it doesn't read it. +class WaveInfoHolder { + public: + typedef WaveData T; + + static bool Write(std::ostream &os, bool binary, const T &t) { + KALDI_ERR << "This holder type does not support writing."; + return true; + } + + void Copy(const T &t) { t_.CopyFrom(t); } + + static bool IsReadInBinary() { return true; } + + void Clear() { t_.Clear(); } + + const T &Value() { return t_; } + + WaveInfoHolder &operator = (const WaveInfoHolder &other) { + t_.CopyFrom(other.t_); + return *this; + } + WaveInfoHolder(const WaveInfoHolder &other): t_(other.t_) {} + + WaveInfoHolder() {} + + bool Read(std::istream &is) { + try { + t_.Read(is, WaveData::kLeaveDataUndefined); // throws exception on failure. + return true; + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught in WaveHolder object (reading). " + << e.what(); + return false; // write failure. 
+ } + } + + void Swap(WaveInfoHolder *other) { + t_.Swap(&(other->t_)); + } + + bool ExtractRange(const WaveInfoHolder &other, const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } private: T t_; }; diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 0ff5f58904e..8c3592908a8 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -7,22 +7,23 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \ compute-cmvn-stats add-deltas remove-mean apply-cmvn transform-feats \ copy-feats compose-transforms splice-feats extract-segments subset-feats \ feat-to-len feat-to-dim fmpe-apply-transform fmpe-acc-stats fmpe-init \ - fmpe-est fmpe-copy fmpe-sum-accs append-feats extend-transform-dim \ + fmpe-est fmpe-copy fmpe-sum-accs extend-transform-dim \ get-full-lda-mat compute-spectrogram-feats extract-feature-segments \ reverse-feats paste-feats select-feats subsample-feats process-pitch-feats \ interpolate-pitch copy-feats-to-htk copy-feats-to-sphinx extract-rows \ apply-cmvn-sliding compute-cmvn-stats-two-channel compute-kaldi-pitch-feats \ process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \ compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \ - append-vector-to-feats detect-sinusoids + wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \ + concat-feats -OBJFILES = +OBJFILES = TESTFILES = ADDLIBS = ../feat/kaldi-feat.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../thread/kaldi-thread.a ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \ - ../util/kaldi-util.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \ + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/featbin/append-feats.cc b/src/featbin/append-feats.cc deleted file mode 100644 index cf373d7a30a..00000000000 --- a/src/featbin/append-feats.cc +++ /dev/null @@ -1,100 +0,0 @@ -// featbin/append-feats.cc - -// Copyright 2012 Petr Motlicek Pawel Swietojanski -// Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
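The file removed below already described itself as deprecated in favour of paste-feats, and the Makefile change above drops it while adding wav-reverberate, shift-feats and concat-feats. An equivalent invocation of the replacement is, for example, paste-feats scp:list1.scp scp:list2.scp ark:- ; the old --truncate-frames option roughly corresponds to paste-feats' --length-tolerance, which trims the streams to the shortest length when they differ by no more than the given number of frames and skips the utterance otherwise.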
- -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "matrix/kaldi-matrix.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - - const char *usage = - "Append 2 feature-streams [and possibly change format]\n" - "Note, this is deprecated; please use paste-feats\n" - "Usage: append-feats [options] \n" - "\n" - "e.g.: append-feats --feats-offset-in1 5 --num-feats-in1 5 scp:list1.scp " - "scp:list2.scp ark:-\n"; - - ParseOptions po(usage); - - bool truncate_frames = false; - - po.Register("truncate-frames", &truncate_frames, "If true, do not treat it " - "as an error when files differ in number of frames, but truncate " - "the longest one."); - - po.Read(argc, argv); - - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - - std::string rspecifier1 = po.GetArg(1), - rspecifier2 = po.GetArg(2), - wspecifier = po.GetArg(3); - - BaseFloatMatrixWriter feats_writer(wspecifier); - SequentialBaseFloatMatrixReader feats_reader1(rspecifier1); - RandomAccessBaseFloatMatrixReader feats_reader2(rspecifier2); - - int32 num_done = 0, num_err = 0; - - for (; !feats_reader1.Done(); feats_reader1.Next()) { - std::string utt = feats_reader1.Key(); - if (!feats_reader2.HasKey(utt)) { - KALDI_WARN << "Could not find features for " << utt << " in " - << rspecifier2 << ": producing no output for the utterance"; - num_err++; - continue; - } - - const Matrix &feats1 = feats_reader1.Value(); - const Matrix &feats2 = feats_reader2.Value(utt); - if (feats1.NumRows() != feats2.NumRows() && !truncate_frames) { - KALDI_WARN << "For utterance " << utt << ", features have different " - << "#frames " << feats1.NumRows() << " vs. " - << feats2.NumRows() << ", producing no output (use " - << "--truncate-frames=true if you want output)"; - num_err++; - continue; - } - int32 num_frames = std::min(feats1.NumRows(), feats2.NumRows()), - dim1 = feats1.NumCols(), dim2 = feats2.NumCols(); - Matrix output(num_frames, dim1 + dim2, kUndefined); - output.Range(0, num_frames, 0, dim1).CopyFromMat( - feats1.Range(0, num_frames, 0, dim1)); - output.Range(0, num_frames, dim1, dim2).CopyFromMat( - feats2.Range(0, num_frames, 0, dim2)); - - feats_writer.Write(utt, output); - num_done++; - } - KALDI_LOG << "Appended " << num_done << " feats; " << num_err - << " with errors."; - return (num_done != 0 ? 0 : 1); - } catch (const std::exception& e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/featbin/append-vector-to-feats.cc b/src/featbin/append-vector-to-feats.cc index 58965159fda..5ca6ae97063 100644 --- a/src/featbin/append-vector-to-feats.cc +++ b/src/featbin/append-vector-to-feats.cc @@ -35,7 +35,7 @@ void AppendVectorToFeats(const Matrix &in, 0, in.NumCols()).CopyFromMat(in); out->Range(0, in.NumRows(), in.NumCols(), vec.Dim()).CopyRowsFromVec(vec); -} +} } @@ -44,31 +44,32 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; using namespace std; - + const char *usage = "Append a vector to each row of input feature files\n" "\n" "Usage: append-vector-to-feats \n" - " or: append-feats \n"; - + " or: append-vector-to-feats \n" + "See also: paste-feats, concat-feats\n"; + ParseOptions po(usage); bool binary = true; po.Register("binary", &binary, "If true, output files in binary " "(only relevant for single-file operation, i.e. no tables)"); - + po.Read(argc, argv); - + if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); } - + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { // We're operating on tables, e.g. archives. 
- - + + string feat_rspecifier = po.GetArg(1); SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); @@ -77,22 +78,22 @@ int main(int argc, char *argv[]) { string wspecifier = po.GetArg(3); BaseFloatMatrixWriter feat_writer(wspecifier); - + int32 num_done = 0, num_err = 0; // Main loop for (; !feat_reader.Done(); feat_reader.Next()) { string utt = feat_reader.Key(); KALDI_VLOG(2) << "Processing utterance " << utt; - + const Matrix &feats(feat_reader.Value()); - + if (!vec_reader.HasKey(utt)) { KALDI_WARN << "Could not read vector for utterance " << utt; num_err++; - continue; + continue; } const Vector &vec(vec_reader.Value(utt)); - + Matrix output; AppendVectorToFeats(feats, vec, &output); feat_writer.Write(utt, output); @@ -132,7 +133,7 @@ EOF cat < 2.vec [ 0 1 ] EOF -append-vector-to-feats --binary=false 1.mat 2.vec 3a.mat +append-vector-to-feats --binary=false 1.mat 2.vec 3a.mat cat < 3b.mat [ 0 1 2 0 1 3 4 5 0 1 diff --git a/src/featbin/compute-spectrogram-feats.cc b/src/featbin/compute-spectrogram-feats.cc index 42f4eeb3602..3a74eb94b2f 100644 --- a/src/featbin/compute-spectrogram-feats.cc +++ b/src/featbin/compute-spectrogram-feats.cc @@ -118,7 +118,7 @@ int main(int argc, char *argv[]) { SubVector waveform(wave_data.Data(), this_chan); Matrix features; try { - spec.Compute(waveform, &features, NULL); + spec.Compute(waveform, 1.0, &features, NULL); } catch (...) { KALDI_WARN << "Failed to compute features for utterance " << utt; diff --git a/src/featbin/concat-feats.cc b/src/featbin/concat-feats.cc new file mode 100644 index 00000000000..1f926061772 --- /dev/null +++ b/src/featbin/concat-feats.cc @@ -0,0 +1,97 @@ +// featbin/concat-feats.cc + +// Copyright 2013 Johns Hopkins University (Author: Daniel Povey) +// 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +/* + This function concatenates several sets of feature vectors + to form a longer set. The length of the output will be equal + to the sum of lengths of the inputs but the dimension will be + the same to the inputs. +*/ + +void ConcatFeats(const std::vector > &in, + Matrix *out) { + KALDI_ASSERT(in.size() >= 1); + int32 tot_len = in[0].NumRows(), + dim = in[0].NumCols(); + for (int32 i = 1; i < in.size(); i++) { + KALDI_ASSERT(in[i].NumCols() == dim); + tot_len += in[i].NumRows(); + } + out->Resize(tot_len, dim); + int32 len_offset = 0; + for (int32 i = 0; i < in.size(); i++) { + int32 this_len = in[i].NumRows(); + out->Range(len_offset, this_len, 0, dim).CopyFromMat( + in[i]); + len_offset += this_len; + } +} + + +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace std; + + const char *usage = + "Concatenate feature files (assuming they have the same dimensions)\n" + "Usage: concat-feats [ ...] \n" + " e.g. 
concat-feats mfcc/foo.ark:12343 mfcc/foo.ark:56789 -\n" + "See also: copy-feats, append-vector-to-feats, paste-feats\n"; + + ParseOptions po(usage); + + bool binary = true; + po.Register("binary", &binary, "If true, output files in binary " + "(only relevant for single-file operation, i.e. no tables)"); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::vector > feats(po.NumArgs() - 1); + for (int32 i = 1; i < po.NumArgs(); i++) + ReadKaldiObject(po.GetArg(i), &(feats[i-1])); + Matrix output; + ConcatFeats(feats, &output); + std::string output_wxfilename = po.GetArg(po.NumArgs()); + WriteKaldiObject(output, output_wxfilename, binary); + + // This will tend to produce too much output if we have a logging mesage. + // KALDI_LOG << "Wrote concatenated features to " << output_wxfilename; + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/featbin/copy-feats-to-htk.cc b/src/featbin/copy-feats-to-htk.cc index 4c7834a89a1..ba0711414c5 100644 --- a/src/featbin/copy-feats-to-htk.cc +++ b/src/featbin/copy-feats-to-htk.cc @@ -50,8 +50,8 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); std::string dir_out = "./"; std::string ext_out = "fea"; - int32 sample_period = 10000; - int32 sample_kind = 9; //USER + int32 sample_period = 100000; // 100ns unit : 10ms = 100000, + int32 sample_kind = 9; // USER, /* 0 WAVEFORM sampled waveform 1 LPC linear prediction filter coefficients diff --git a/src/featbin/copy-feats.cc b/src/featbin/copy-feats.cc index 3c123b70d60..258466b4f3b 100644 --- a/src/featbin/copy-feats.cc +++ b/src/featbin/copy-feats.cc @@ -34,7 +34,8 @@ int main(int argc, char *argv[]) { "e.g.: copy-feats ark:- ark,scp:foo.ark,foo.scp\n" " or: copy-feats ark:foo.ark ark,t:txt.ark\n" "See also: copy-matrix, copy-feats-to-htk, copy-feats-to-sphinx, select-feats,\n" - "extract-rows, subset-feats, subsample-feats, splice-feats, append-feats\n"; + "extract-rows, subset-feats, subsample-feats, splice-feats, paste-feats,\n" + "concat-feats\n"; ParseOptions po(usage); bool binary = true; @@ -48,7 +49,7 @@ int main(int argc, char *argv[]) { po.Register("compress", &compress, "If true, write output in compressed form" "(only currently supported for wxfilename, i.e. archive/script," "output)"); - + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -57,7 +58,7 @@ int main(int argc, char *argv[]) { } int32 num_done = 0; - + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { // Copying tables of features. std::string rspecifier = po.GetArg(1); @@ -101,7 +102,7 @@ int main(int argc, char *argv[]) { return (num_done != 0 ? 0 : 1); } else { KALDI_ASSERT(!compress && "Compression not yet supported for single files"); - + std::string feat_rxfilename = po.GetArg(1), feat_wxfilename = po.GetArg(2); Matrix feat_matrix; diff --git a/src/featbin/extract-feature-segments.cc b/src/featbin/extract-feature-segments.cc index d3b2661b76b..93f599feb3a 100644 --- a/src/featbin/extract-feature-segments.cc +++ b/src/featbin/extract-feature-segments.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Govivace Inc. // 2012-2013 Mirko Hannemann; Arnab Ghoshal +// 2015 Tanel Alumae // See ../../COPYING for clarification regarding multiple authors // @@ -33,50 +34,69 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; - + const char *usage = "Create feature files by segmenting input files.\n" - "Usage: extract-feature-segments [options...] 
\n" - " (segments-file has lines like: output-utterance-id input-utterance-or-spk-id 1.10 2.36)\n"; + "Usage: " + "extract-feature-segments [options...] " + " \n" + " (segments-file has lines like: " + "output-utterance-id input-utterance-or-spk-id 1.10 2.36)\n"; // construct all the global objects ParseOptions po(usage); BaseFloat min_segment_length = 0.1, // Minimum segment length in seconds. max_overshoot = 0.0; // max time by which last segment can overshoot - BaseFloat samp_freq = 100; // feature sampling frequency (assuming 10ms window shift) + int32 frame_shift = 10; + int32 frame_length = 25; + bool snip_edges = true; // Register the options po.Register("min-segment-length", &min_segment_length, - "Minimum segment length in seconds (reject shorter segments)"); - po.Register("frame-rate", &samp_freq, - "Feature sampling frequency (e.g. 100 for 10ms window shift)"); + "Minimum segment length in seconds (reject shorter segments)"); + po.Register("frame-length", &frame_length, "Frame length in milliseconds"); + po.Register("frame-shift", &frame_shift, "Frame shift in milliseconds"); po.Register("max-overshoot", &max_overshoot, - "End segments overshooting by less (in seconds) are truncated," - " else rejected."); + "End segments overshooting by less (in seconds) are truncated," + " else rejected."); + po.Register("snip-edges", &snip_edges, + "If true, n_frames frames will be snipped from the end of each " + "extracted feature matrix, " + "where n_frames = ceil((frame_length - frame_shift) / frame_shift), " + "This ensures that only the feature vectors that " + "completely fit in the segment are extracted. " + "This makes the extracted segment lengths match the lengths of the " + "features that have been extracted from already segmented audio."); // OPTION PARSING ... // parse options (+filling the registered variables) po.Read(argc, argv); - // number of arguments should be 3(scriptfile,segments file and outputwav write mode) + // number of arguments should be 3 + // (scriptfile, segments file and outputwav write mode) if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); } - - std::string rspecifier = po.GetArg(1); // get script file/feature archive - std::string segments_rxfilename = po.GetArg(2);// get segment file - std::string wspecifier = po.GetArg(3); // get written archive name + std::string rspecifier = po.GetArg(1); // get script file/feature archive + std::string segments_rxfilename = po.GetArg(2); // get segment file + std::string wspecifier = po.GetArg(3); // get written archive name BaseFloatMatrixWriter feat_writer(wspecifier); - RandomAccessBaseFloatMatrixReader feat_reader(rspecifier); + RandomAccessBaseFloatMatrixReader feat_reader(rspecifier); - Input ki(segments_rxfilename); // no binary argment: never binary. + Input ki(segments_rxfilename); // no binary argment: never binary. 
int32 num_lines = 0, num_success = 0; - + + int32 snip_length = 0; + if (snip_edges) { + snip_length = static_cast(ceil( + 1.0 * (frame_length - frame_shift) / frame_shift)); + } + std::string line; /* read each line from segments file */ while (std::getline(ki.Stream(), line)) { @@ -106,18 +126,20 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Invalid line in segments file [bad end]: " << line; continue; } + // start time must not be negative; start time must not be greater than // end time, except if end time is -1 if (start < 0 || end <= 0 || start >= end) { - KALDI_WARN << "Invalid line in segments file [empty or invalid segment]: " + KALDI_WARN << "Invalid line in segments file " + "[empty or invalid segment]: " << line; continue; } int32 channel = -1; // means channel info is unspecified. // if each line has 5 elements then 5th element must be channel identifier - if(split_line.size() == 5) { + if (split_line.size() == 5) { if (!ConvertStringToInteger(split_line[4], &channel) || channel < 0) { - KALDI_WARN << "Invalid line in segments file [bad channel]: " << line; + KALDI_WARN<< "Invalid line in segments file [bad channel]: " << line; continue; } } @@ -131,45 +153,62 @@ int main(int argc, char *argv[]) { continue; } const Matrix &feats = feat_reader.Value(utterance); - int32 num_samp = feats.NumRows(), // total number of samples present in wav data - num_chan = feats.NumCols(); // total number of channels present in wav file - + // total number of samples present in wav data + int32 num_samp = feats.NumRows(); + // total number of channels present in wav file + int32 num_chan = feats.NumCols(); // Convert start & end times of the segment to corresponding sample number - int32 start_samp = static_cast(start * samp_freq); - int32 end_samp = static_cast(end * samp_freq); + int32 start_samp = static_cast(round( + (start * 1000.0 / frame_shift))); + int32 end_samp = static_cast(round(end * 1000.0 / frame_shift)); + + if (snip_edges) { + // snip the edge at the end of the segment (usually 2 frames), + end_samp -= snip_length; + } + /* start sample must be less than total number of samples * otherwise skip the segment */ if (start_samp < 0 || start_samp >= num_samp) { - KALDI_WARN << "Start sample out of range " << start_samp << " [length:] " - << num_samp << "x" << num_chan << ", skipping segment " << segment; + KALDI_WARN << "Start sample out of range " << start_samp + << " [length:] " << num_samp << "x" << num_chan + << ", skipping segment " << segment; continue; } + /* end sample must be less than total number samples * otherwise skip the segment */ if (end_samp > num_samp) { - if (end_samp >= - num_samp + static_cast(max_overshoot * samp_freq)) { - KALDI_WARN << "End sample too far out of range " << end_samp - << " [length:] " << num_samp << "x" << num_chan << ", skipping segment " - << segment; + if (end_samp >= num_samp + + static_cast( + round(max_overshoot * 1000.0 / frame_shift))) { + KALDI_WARN<< "End sample too far out of range " << end_samp + << " [length:] " << num_samp << "x" << num_chan + << ", skipping segment " + << segment; continue; } - end_samp = num_samp; // for small differences, just truncate. + end_samp = num_samp; // for small differences, just truncate. 
} + /* check whether the segment size is less than minimum segment length(default 0.1 sec) * if yes, skip the segment */ - if (end_samp <= - start_samp + static_cast(min_segment_length * samp_freq)) { - KALDI_WARN << "Segment " << segment << " too short, skipping it."; + if (end_samp + <= start_samp + + static_cast(round( + (min_segment_length * 1000.0 / frame_shift)))) { + KALDI_WARN<< "Segment " << segment << " too short, skipping it."; continue; } - SubMatrix segment_matrix(feats, start_samp, end_samp-start_samp, 0, num_chan); + SubMatrix segment_matrix(feats, start_samp, + end_samp-start_samp, 0, num_chan); Matrix outmatrix(segment_matrix); - feat_writer.Write(segment, outmatrix); // write segment in feature archive. + // write segment in feature archive. + feat_writer.Write(segment, outmatrix); num_success++; } KALDI_LOG << "Successfully processed " << num_success << " lines out of " diff --git a/src/featbin/extract-segments.cc b/src/featbin/extract-segments.cc index 47afca5668d..f5ed4441a03 100644 --- a/src/featbin/extract-segments.cc +++ b/src/featbin/extract-segments.cc @@ -20,7 +20,6 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "feat/feature-mfcc.h" #include "feat/wave-reader.h" /*! @brief This is the main program for extracting segments from a wav file @@ -123,7 +122,7 @@ int main(int argc, char *argv[]) { /* check whether a segment start time and end time exists in recording * if fails , skips the segment. */ - if (!reader.HasKey(recording)) { + if (!reader.HasKey(recording)) { KALDI_WARN << "Could not find recording " << recording << ", skipping segment " << segment; continue; diff --git a/src/featbin/paste-feats.cc b/src/featbin/paste-feats.cc index 5eab09d96c1..553bca9064c 100644 --- a/src/featbin/paste-feats.cc +++ b/src/featbin/paste-feats.cc @@ -50,7 +50,7 @@ bool AppendFeats(const std::vector > &in, } if (max_len - min_len > 0) { KALDI_VLOG(2) << "Length mismatch " << max_len << " vs. " << min_len - << (utt.empty() ? "" : " for utt ") << utt + << (utt.empty() ? "" : " for utt ") << utt << " within tolerance " << tolerance; } out->Resize(min_len, tot_dim); @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; using namespace std; - + const char *usage = "Paste feature files (assuming they have the same lengths); think of the\n" "unix command paste a b.\n" @@ -79,8 +79,8 @@ int main(int argc, char *argv[]) { " or: paste-feats [ ...] \n" " e.g. paste-feats ark:feats1.ark \"ark:select-feats 0-3 ark:feats2.ark ark:- |\" ark:feats-out.ark\n" " or: paste-feats foo.mat bar.mat baz.mat\n" - "See also: copy-feats, copy-matrix, append-vector-to-feats, concat-feats\n"; - + "See also: copy-feats, copy-matrix, append-vector-to-feats\n"; + ParseOptions po(usage); int32 length_tolerance = 0; @@ -90,22 +90,22 @@ int main(int argc, char *argv[]) { " difference of length-tolerance, otherwise exclude segment."); po.Register("binary", &binary, "If true, output files in binary " "(only relevant for single-file operation, i.e. no tables)"); - + po.Read(argc, argv); - + if (po.NumArgs() < 3) { po.PrintUsage(); exit(1); } - + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { // We're operating on tables, e.g. archives. 
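Back in extract-feature-segments above, the selected frame range is cut out with a SubMatrix view and then copied into a regular Matrix before being written to the archive. A hedged, stand-alone usage sketch of that pattern (not part of the patch; sizes are made up):

#include "base/kaldi-common.h"
#include "matrix/kaldi-matrix.h"

int main() {
  using namespace kaldi;
  Matrix<BaseFloat> feats(300, 13);   // e.g. 300 frames of 13-dim MFCCs
  feats.SetRandn();
  int32 start_frame = 110, num_frames = 124;
  // View of rows [start_frame, start_frame + num_frames), all columns:
  SubMatrix<BaseFloat> segment(feats, start_frame, num_frames, 0, feats.NumCols());
  Matrix<BaseFloat> out(segment);     // deep copy, ready to be written out
  KALDI_ASSERT(out.NumRows() == num_frames);
  return 0;
}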
- + // Last argument is output string wspecifier = po.GetArg(po.NumArgs()); BaseFloatMatrixWriter feat_writer(wspecifier); - + // First input is sequential string rspecifier1 = po.GetArg(1); SequentialBaseFloatMatrixReader input1(rspecifier1); @@ -117,14 +117,14 @@ int main(int argc, char *argv[]) { RandomAccessBaseFloatMatrixReader *rd = new RandomAccessBaseFloatMatrixReader(rspecifier); input.push_back(rd); } - + int32 num_done = 0, num_err = 0; - + // Main loop for (; !input1.Done(); input1.Next()) { string utt = input1.Key(); KALDI_VLOG(2) << "Merging features for utterance " << utt; - + // Collect features from streams to vector 'feats' vector > feats(po.NumArgs() - 1); feats[0] = input1.Value(); @@ -189,7 +189,7 @@ cat < 2.mat [ 0 1 2 3 ] EOF -paste-feats --length-tolerance=1 --binary=false 1.mat 2.mat 3a.mat +paste-feats --length-tolerance=1 --binary=false 1.mat 2.mat 3a.mat cat < 3b.mat [ 0 1 2 0 1 3 4 5 2 3 ] diff --git a/src/featbin/shift-feats.cc b/src/featbin/shift-feats.cc new file mode 100644 index 00000000000..7b970e92248 --- /dev/null +++ b/src/featbin/shift-feats.cc @@ -0,0 +1,90 @@ +// featbin/shift-feats.cc + +// Copyright 2009-2011 Microsoft Corporation +// 2013-2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "Copy features and possibly shift them in time while maintaining the length, e.g.\n" + "shift-feats --shift=1 will shift all frames to the\n" + "right by one (the first frame would be duplicated).\n" + "See also: copy-feats, copy-matrix\n"; + + ParseOptions po(usage); + int32 shift = 0; + po.Register("shift", &shift, "Number of frames by which to shift the features."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + int32 num_done = 0, num_err = 0; + + SequentialBaseFloatMatrixReader feat_reader(po.GetArg(1)); + BaseFloatMatrixWriter feat_writer(po.GetArg(2)); + + + for (; !feat_reader.Done(); feat_reader.Next()) { + const std::string &key = feat_reader.Key(); + const Matrix &src = feat_reader.Value(); + if (src.NumRows() == 0) { + KALDI_WARN << "Empty matrix for key " << key; + num_err++; + continue; + } + Matrix rearranged(src.NumRows(), src.NumCols()); + for (int32 r = 0; r < src.NumRows(); r++) { + int32 src_r = r - shift; + if (src_r < 0) src_r = 0; + if (src_r >= src.NumRows()) src_r = src.NumRows() - 1; + rearranged.Row(r).CopyFromVec(src.Row(src_r)); + } + feat_writer.Write(key, rearranged); + num_done++; + } + + KALDI_LOG << "Shifted " << num_done << " features by " + << shift << " frames; " << num_err << " with errors."; + return (num_done > 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + + +/* +test: + echo "foo [ 1 1; 2 2; 3 3 ]" | shift-feats --shift=1 ark:- ark,t:- + outputs: + foo [ + 1 1 + 1 1 + 2 2 ] +*/ diff --git a/src/featbin/subsample-feats.cc b/src/featbin/subsample-feats.cc index 9a8d5520433..0d79ce5030f 100644 --- a/src/featbin/subsample-feats.cc +++ b/src/featbin/subsample-feats.cc @@ -31,17 +31,17 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; using namespace std; - + const char *usage = - "Sub-samples features by taking every n'th frame." + "Sub-samples features by taking every n'th frame.\n" "With negative values of n, will repeat each frame n times\n" "(e.g. --n=-2 will repeat each frame twice)\n" "\n" "Usage: subsample-feats [options] \n" " e.g. subsample-feats --n=2 ark:- ark:-\n"; - + ParseOptions po(usage); - + int32 n = 1, offset = 0; po.Register("n", &n, "Take every n'th feature, for this value of n" @@ -53,23 +53,23 @@ int main(int argc, char *argv[]) { if (n < 0) KALDI_ASSERT(offset == 0 && "--offset option cannot be used with negative n."); - + po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); - } + } string rspecifier = po.GetArg(1); string wspecifier = po.GetArg(2); - + SequentialBaseFloatMatrixReader feat_reader(rspecifier); BaseFloatMatrixWriter feat_writer(wspecifier); int32 num_done = 0, num_err = 0; int64 frames_in = 0, frames_out = 0; - + // process all keys for (; !feat_reader.Done(); feat_reader.Next()) { std::string utt = feat_reader.Key(); @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) { frames_in += feats.NumRows(); frames_out += num_indexes; - + if (num_indexes == 0) { KALDI_WARN << "For utterance " << utt << ", output would have no rows, " << "producing no output."; @@ -108,7 +108,7 @@ int main(int argc, char *argv[]) { output.Row(i).CopyFromVec(feats.Row(i / repeat)); frames_in += feats.NumRows(); frames_out += feats.NumRows() * repeat; - feat_writer.Write(utt, output); + feat_writer.Write(utt, output); num_done++; } } diff --git a/src/featbin/wav-reverberate.cc b/src/featbin/wav-reverberate.cc new file mode 100644 index 00000000000..d7599c5ea3d --- /dev/null +++ b/src/featbin/wav-reverberate.cc @@ -0,0 +1,260 @@ +// featbin/wav-reverberate.cc + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" +#include "feat/signal.h" + +namespace kaldi { + +/* + This function is to repeatedly concatenate signal1 by itself + to match the length of signal2 and add the two signals together. 
+*/ +void AddVectorsOfUnequalLength(const Vector &signal1, Vector *signal2) { + for (int32 po = 0; po < signal2->Dim(); po += signal1.Dim()) { + int32 block_length = signal1.Dim(); + if (signal2->Dim() - po < block_length) block_length = signal2->Dim() - po; + signal2->Range(po, block_length).AddVec(1.0, signal1.Range(0, block_length)); + } +} + +BaseFloat MaxAbsolute(const Vector &vector) { + return std::max(std::abs(vector.Max()), std::abs(vector.Min())); +} + +/* + Early reverberation component of the signal is composed of reflections + within 0.05 seconds of the direct path signal (assumed to be the peak of + the room impulse response). This function returns the energy in + this early reverberation component of the signal. + The input parameters to this function are the room impulse response, the signal + and their sampling frequency respectively. +*/ +BaseFloat ComputeEarlyReverbEnergy(const Vector &rir, const Vector &signal, + BaseFloat samp_freq) { + int32 peak_index = 0; + rir.Max(&peak_index); + KALDI_VLOG(1) << "peak index is " << peak_index; + + const float sec_before_peak = 0.001; + const float sec_after_peak = 0.05; + int32 early_rir_start_index = peak_index - sec_before_peak * samp_freq; + int32 early_rir_end_index = peak_index + sec_after_peak * samp_freq; + if (early_rir_start_index < 0) early_rir_start_index = 0; + if (early_rir_end_index > rir.Dim()) early_rir_end_index = rir.Dim(); + + int32 duration = early_rir_end_index - early_rir_start_index; + Vector early_rir(rir.Range(early_rir_start_index, duration)); + Vector early_reverb(signal); + FFTbasedBlockConvolveSignals(early_rir, &early_reverb); + + // compute the energy + return VecVec(early_reverb, early_reverb) / early_reverb.Dim(); +} + +/* + This is the core function to do reverberation and noise addition + on the given signal. The noise will be scaled before the addition + to match the given signal-to-noise ratio (SNR) and it will also concatenate + itself repeatedly to match the length of the signal. + The input parameters to this function are the room impulse response, + the sampling frequency, the SNR(dB), the noise and the signal respectively. +*/ +void DoReverberation(const Vector &rir, BaseFloat samp_freq, + BaseFloat snr_db, Vector *noise, + Vector *signal) { + if (noise->Dim()) { + float input_power = ComputeEarlyReverbEnergy(rir, *signal, samp_freq); + float noise_power = VecVec(*noise, *noise) / noise->Dim(); + float scale_factor = sqrt(pow(10, -snr_db / 10) * input_power / noise_power); + noise->Scale(scale_factor); + KALDI_VLOG(1) << "Noise signal is being scaled with " << scale_factor + << " to generate output with SNR " << snr_db << "db\n"; + } + + FFTbasedBlockConvolveSignals(rir, signal); + + if (noise->Dim() > 0) { + AddVectorsOfUnequalLength(*noise, signal); + } +} +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "Corrupts the wave files supplied via input pipe with the specified\n" + "room-impulse response (rir_matrix) and additive noise distortions\n" + "(specified by corresponding files).\n" + "Usage: wav-reverberate [options...] 
" + " \n" + "e.g.\n" + "wav-reverberate --noise-file=noise.wav \\\n" + " input.wav rir.wav output.wav\n"; + + ParseOptions po(usage); + std::string noise_file; + BaseFloat snr_db = 20; + bool multi_channel_output = false; + int32 input_channel = 0; + int32 rir_channel = 0; + int32 noise_channel = 0; + bool normalize_output = true; + BaseFloat volume = 0; + + po.Register("multi-channel-output", &multi_channel_output, + "Specifies if the output should be multi-channel or not"); + po.Register("input-wave-channel", &input_channel, + "Specifies the channel to be used from input as only a " + "single channel will be used to generate reverberated output"); + po.Register("rir-channel", &rir_channel, + "Specifies the channel of the room impulse response, " + "it will only be used when multi-channel-output is false"); + po.Register("noise-channel", &noise_channel, + "Specifies the channel of the noise file, " + "it will only be used when multi-channel-output is false"); + po.Register("noise-file", &noise_file, + "File with additive noise"); + po.Register("snr-db", &snr_db, + "Desired SNR(dB) of the output"); + po.Register("normalize-output", &normalize_output, + "If true, then after reverberating and " + "possibly adding noise, scale so that the signal " + "energy is the same as the original input signal."); + po.Register("volume", &volume, + "If nonzero, a scaling factor on the signal that is applied " + "after reverberating and possibly adding noise. " + "If you set this option to a nonzero value, it will be as" + "if you had also specified --normalize-output=false."); + + po.Read(argc, argv); + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + if (multi_channel_output) { + if (rir_channel != 0 || noise_channel != 0) + KALDI_WARN << "options for --rir-channel and --noise-channel" + "are ignored as --multi-channel-output is true."; + } + + std::string input_wave_file = po.GetArg(1); + std::string rir_file = po.GetArg(2); + std::string output_wave_file = po.GetArg(3); + + WaveData input_wave; + { + Input ki(input_wave_file); + input_wave.Read(ki.Stream()); + } + + const Matrix &input_matrix = input_wave.Data(); + BaseFloat samp_freq_input = input_wave.SampFreq(); + int32 num_samp_input = input_matrix.NumCols(), // #samples in the input + num_input_channel = input_matrix.NumRows(); // #channels in the input + KALDI_VLOG(1) << "sampling frequency of input: " << samp_freq_input + << " #samples: " << num_samp_input + << " #channel: " << num_input_channel; + KALDI_ASSERT(input_channel < num_input_channel); + + WaveData rir_wave; + { + Input ki(rir_file); + rir_wave.Read(ki.Stream()); + } + const Matrix &rir_matrix = rir_wave.Data(); + BaseFloat samp_freq_rir = rir_wave.SampFreq(); + int32 num_samp_rir = rir_matrix.NumCols(), + num_rir_channel = rir_matrix.NumRows(); + KALDI_VLOG(1) << "sampling frequency of rir: " << samp_freq_rir + << " #samples: " << num_samp_rir + << " #channel: " << num_rir_channel; + if (!multi_channel_output) { + KALDI_ASSERT(rir_channel < num_rir_channel); + } + + Matrix noise_matrix; + if (!noise_file.empty()) { + WaveData noise_wave; + { + Input ki(noise_file); + noise_wave.Read(ki.Stream()); + } + noise_matrix = noise_wave.Data(); + BaseFloat samp_freq_noise = noise_wave.SampFreq(); + int32 num_samp_noise = noise_matrix.NumCols(), + num_noise_channel = noise_matrix.NumRows(); + KALDI_VLOG(1) << "sampling frequency of noise: " << samp_freq_noise + << " #samples: " << num_samp_noise + << " #channel: " << num_noise_channel; + if (multi_channel_output) { + 
KALDI_ASSERT(num_rir_channel == num_noise_channel); + } else { + KALDI_ASSERT(noise_channel < num_noise_channel); + } + } + + int32 num_output_channels = (multi_channel_output ? num_rir_channel : 1); + Matrix out_matrix(num_output_channels, num_samp_input); + + for (int32 output_channel = 0; output_channel < num_output_channels; output_channel++) { + Vector input(num_samp_input); + input.CopyRowFromMat(input_matrix, input_channel); + float power_before_reverb = VecVec(input, input) / input.Dim(); + + int32 this_rir_channel = (multi_channel_output ? output_channel : rir_channel); + Vector rir(num_samp_rir); + rir.CopyRowFromMat(rir_matrix, this_rir_channel); + rir.Scale(1.0 / (1 << 15)); + + Vector noise(0); + if (!noise_file.empty()) { + noise.Resize(noise_matrix.NumCols()); + int32 this_noise_channel = (multi_channel_output ? output_channel : noise_channel); + noise.CopyRowFromMat(noise_matrix, this_noise_channel); + } + + DoReverberation(rir, samp_freq_rir, snr_db, &noise, &input); + + float power_after_reverb = VecVec(input, input) / input.Dim(); + + if (volume > 0) + input.Scale(volume); + else if (normalize_output) + input.Scale(sqrt(power_before_reverb / power_after_reverb)); + + out_matrix.CopyRowFromVec(input, output_channel); + } + + WaveData out_wave(samp_freq_input, out_matrix); + Output ko(output_wave_file, false); + out_wave.Write(ko.Stream()); + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/featbin/wav-to-duration.cc b/src/featbin/wav-to-duration.cc index b0f23f35115..2eb95dc3fc1 100644 --- a/src/featbin/wav-to-duration.cc +++ b/src/featbin/wav-to-duration.cc @@ -30,10 +30,17 @@ int main(int argc, char *argv[]) { "the duration of each one in seconds.\n" "Usage: wav-to-duration [options...] 
\n" "E.g.: wav-to-duration scp:wav.scp ark,t:-\n" - "See also: wav-copy extract-segments feat-to-len\n"; + "See also: wav-copy extract-segments feat-to-len\n" + "Currently this program may output a lot of harmless warnings regarding\n" + "nonzero exit status of pipes\n"; + + bool read_entire_file = false; ParseOptions po(usage); + po.Register("read-entire-file", &read_entire_file, "If true, use regular WaveHolder " + "instead of WaveInfoHolder to ensure the returned duration is correct."); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -48,21 +55,36 @@ int main(int argc, char *argv[]) { double sum_duration = 0.0, min_duration = std::numeric_limits::infinity(), max_duration = 0; - - SequentialTableReader wav_reader(wav_rspecifier); + int32 num_done = 0; + BaseFloatWriter duration_writer(duration_wspecifier); + if (read_entire_file) { + SequentialTableReader wav_reader(wav_rspecifier); + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string key = wav_reader.Key(); + const WaveData &wave_data = wav_reader.Value(); + BaseFloat duration = wave_data.Duration(); + duration_writer.Write(key, duration); - int32 num_done = 0; - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string key = wav_reader.Key(); - const WaveData &wave_data = wav_reader.Value(); - BaseFloat duration = wave_data.Duration(); - duration_writer.Write(key, duration); - - sum_duration += duration; - min_duration = std::min(min_duration, duration); - max_duration = std::max(max_duration, duration); - num_done++; + sum_duration += duration; + min_duration = std::min(min_duration, duration); + max_duration = std::max(max_duration, duration); + num_done++; + } + } + else { + SequentialTableReader wav_reader(wav_rspecifier); + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string key = wav_reader.Key(); + const WaveData &wave_data = wav_reader.Value(); + BaseFloat duration = wave_data.Duration(); + duration_writer.Write(key, duration); + + sum_duration += duration; + min_duration = std::min(min_duration, duration); + max_duration = std::max(max_duration, duration); + num_done++; + } } KALDI_LOG << "Printed duration for " << num_done << " audio files."; diff --git a/src/fgmmbin/Makefile b/src/fgmmbin/Makefile index 49bfa11aade..c8d01e31b6e 100644 --- a/src/fgmmbin/Makefile +++ b/src/fgmmbin/Makefile @@ -7,7 +7,7 @@ BINFILES = fgmm-global-acc-stats fgmm-global-sum-accs fgmm-global-est \ fgmm-global-merge fgmm-global-to-gmm fgmm-gselect fgmm-global-get-frame-likes \ fgmm-global-acc-stats-twofeats fgmm-global-copy fgmm-global-mixdown \ fgmm-global-gselect-to-post fgmm-global-info \ - fgmm-global-acc-stats-post fgmm-global-init-from-accs + fgmm-global-acc-stats-post fgmm-global-init-from-accs OBJFILES = @@ -17,8 +17,8 @@ OBJFILES = TESTFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../feat/kaldi-feat.a \ - ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a ../thread/kaldi-thread.a \ + ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \ - ../util/kaldi-util.a ../base/kaldi-base.a + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fgmmbin/fgmm-global-init-from-accs.cc b/src/fgmmbin/fgmm-global-init-from-accs.cc index def175c3b87..23dc6be75cf 100644 --- a/src/fgmmbin/fgmm-global-init-from-accs.cc +++ b/src/fgmmbin/fgmm-global-init-from-accs.cc @@ -60,8 +60,9 @@ int main(int argc, char *argv[]) { gmm_accs.Read(ki.Stream(), binary, true /* add accs. 
*/); } - int32 num_gauss = gmm_accs.NumGauss(), - dim = gmm_accs.Dim(); + int32 num_gauss = gmm_accs.NumGauss(), dim = gmm_accs.Dim(), + tot_floored = 0, gauss_floored = 0; + FullGmm fgmm(num_components, dim); Vector weights(num_gauss); @@ -85,14 +86,26 @@ int main(int argc, char *argv[]) { SpMatrix covar(gmm_accs.covariance_accumulator()[i]); covar.Scale(1.0 / occ); covar.AddVec2(-1.0, means.Row(i)); // subtract squared means. - covar.Invert(); + // Floor variance Eigenvalues. + BaseFloat floor = std::max( + static_cast(gmm_opts.variance_floor), + static_cast(covar.MaxAbsEig() / gmm_opts.max_condition)); + int32 floored = covar.ApplyFloor(floor); + if (floored) { + tot_floored += floored; + gauss_floored++; + } + covar.InvertDouble(); invcovars.push_back(covar); } fgmm.SetWeights(weights); fgmm.SetInvCovarsAndMeans(invcovars, means); int32 num_bad = fgmm.ComputeGconsts(); KALDI_LOG << "FullGmm has " << num_bad << " bad GConsts"; - + if (tot_floored > 0) { + KALDI_WARN << tot_floored << " variances floored in " << gauss_floored + << " Gaussians."; + } WriteKaldiObject(fgmm, model_out_filename, binary_write); KALDI_LOG << "Written model to " << model_out_filename; diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile index 6c381c48690..6106262859a 100644 --- a/src/fstbin/Makefile +++ b/src/fstbin/Makefile @@ -17,14 +17,14 @@ BINFILES = fstdeterminizestar \ fstdeterminizelog fstphicompose fstrhocompose fstpropfinal fstcopy \ fstpushspecial fsts-to-transcripts -OBJFILES = +OBJFILES = -TESTFILES = +TESTFILES = # actually, this library is currently empty. Everything is a header. -LIBFILE = +LIBFILE = -ADDLIBS = ../fstext/kaldi-fstext.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ - ../util/kaldi-util.a +ADDLIBS = ../fstext/kaldi-fstext.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fstbin/fstaddselfloops.cc b/src/fstbin/fstaddselfloops.cc index 9219093bee1..96895f23cf4 100644 --- a/src/fstbin/fstaddselfloops.cc +++ b/src/fstbin/fstaddselfloops.cc @@ -45,8 +45,9 @@ int main(int argc, char *argv[]) { "on at least one arc out of the state. 
Useful in conjunction with predeterminize\n" "\n" "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst [out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n"; - + "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" + "in.list and out.list are lists of integers, one per line, of the\n" + "same length.\n"; ParseOptions po(usage); po.Read(argc, argv); @@ -62,12 +63,12 @@ int main(int argc, char *argv[]) { fst_out_filename = po.GetOptArg(4); VectorFst *fst = ReadFstKaldi(fst_in_filename); - + std::vector disambig_in; if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " << kaldi::PrintableRxfilename(disambig_in_rxfilename); - + std::vector disambig_out; if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { WriteFstKaldi(*fst, fst_out_filename); delete fst; - + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/fstbin/fstrmsymbols.cc b/src/fstbin/fstrmsymbols.cc index 438170c2b98..75f5ab18654 100644 --- a/src/fstbin/fstrmsymbols.cc +++ b/src/fstbin/fstrmsymbols.cc @@ -25,19 +25,62 @@ #include "fstext/fstext-utils.h" #include "fstext/kaldi-fst-io.h" -/* some test examples: - ( echo 3; echo 4) > /tmp/in.list - ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols /tmp/in.list | fstprint +namespace fst { +// we can move these functions elsewhere later, if they are needed in other +// places. - cd ~/tmpdir - while true; do - fstrand > 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst > 2.fst - fstequivalent --random=true 1.fst 2.fst || echo "Test failed" - echo -n "." - done +template +void RemoveArcsWithSomeInputSymbols(const std::vector &symbols_in, + VectorFst *fst) { + typedef typename Arc::StateId StateId; + + kaldi::ConstIntegerSet symbol_set(symbols_in); + + StateId num_states = fst->NumStates(); + StateId dead_state = fst->AddState(); + for (StateId s = 0; s < num_states; s++) { + for (MutableArcIterator > iter(fst, s); + !iter.Done(); iter.Next()) { + if (symbol_set.count(iter.Value().ilabel) != 0) { + Arc arc = iter.Value(); + arc.nextstate = dead_state; + iter.SetValue(arc); + } + } + } + // Connect() will actually remove the arcs, and the dead state. 
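+  // (Why this works: Connect() keeps only states that are both accessible
+  // from the start state and co-accessible to a final state. dead_state has
+  // no final weight and no outgoing arcs, so it is not co-accessible, and
+  // Connect() deletes it together with every arc redirected into it.)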
+ Connect(fst); + if (fst->NumStates() == 0) + KALDI_WARN << "After Connect(), fst was empty."; +} + +template +void PenalizeArcsWithSomeInputSymbols(const std::vector &symbols_in, + float penalty, + VectorFst *fst) { + typedef typename Arc::StateId StateId; + typedef typename Arc::Label Label; + typedef typename Arc::Weight Weight; + + Weight penalty_weight(penalty); + + kaldi::ConstIntegerSet symbol_set(symbols_in); + + StateId num_states = fst->NumStates(); + for (StateId s = 0; s < num_states; s++) { + for (MutableArcIterator > iter(fst, s); + !iter.Done(); iter.Next()) { + if (symbol_set.count(iter.Value().ilabel) != 0) { + Arc arc = iter.Value(); + arc.weight = Times(arc.weight, penalty_weight); + iter.SetValue(arc); + } + } + } +} + +} -*/ int main(int argc, char *argv[]) { try { @@ -45,47 +88,105 @@ int main(int argc, char *argv[]) { using namespace fst; using kaldi::int32; - bool remove_from_output = false; - + bool apply_to_output = false; + bool remove_arcs = false; + float penalty = -std::numeric_limits::infinity(); + const char *usage = - "Replaces a subset of symbols with epsilon, wherever they appear on the input side\n" - "of an FST (or the output side, with --remove-from-output=true)\n" + "With no options, replaces a subset of symbols with epsilon, wherever\n" + "they appear on the input side of an FST." + "With --remove-arcs=true, will remove arcs that contain these symbols\n" + "on the input\n" + "With --penalty=, will add the specified penalty to the\n" + "cost of any arc that has one of the given symbols on its input side\n" + "In all cases, the option --apply-to-output=true (or for\n" + "back-compatibility, --remove-from-output=true) makes this apply\n" + "to the output side.\n" "\n" - "Usage: fstrmsymbols in-disambig-list [in.fst [out.fst] ]\n" - "E.g: fstrmsymbols in.list < in.fst > out.fst\n"; + "Usage: fstrmsymbols [options] [ []]\n" + "E.g: fstrmsymbols in.list < in.fst > out.fst\n" + " is an rxfilename specifying a file containing list of integers\n" + "representing symbols, in text form, one per line.\n"; ParseOptions po(usage); - po.Register("remove-from-output", &remove_from_output, "If true, remove these symbols from " - "the output, not the input, side."); + po.Register("remove-from-output", &apply_to_output, "If true, this applies to symbols " + "on the output, not the input, side. (For back compatibility; use " + "--apply-to-output insead)"); + po.Register("apply-to-output", &apply_to_output, "If true, this applies to symbols " + "on the output, not the input, side."); + po.Register("remove-arcs", &remove_arcs, "If true, instead of converting the symbol " + "to , remove the arcs."); + po.Register("penalty", &penalty, "If specified, instead of converting " + "the symbol to , penalize the arc it is on by adding this " + "value to its cost."); + + po.Read(argc, argv); + if (remove_arcs && + penalty != -std::numeric_limits::infinity()) + KALDI_ERR << "--remove-arc and --penalty options are mutually exclusive"; + if (po.NumArgs() < 1 || po.NumArgs() > 3) { po.PrintUsage(); exit(1); } - + std::string disambig_rxfilename = po.GetArg(1), fst_rxfilename = po.GetOptArg(2), fst_wxfilename = po.GetOptArg(3); VectorFst *fst = ReadFstKaldi(fst_rxfilename); - + std::vector disambig_in; if (!ReadIntegerVectorSimple(disambig_rxfilename, &disambig_in)) KALDI_ERR << "fstrmsymbols: Could not read disambiguation symbols from " << (disambig_rxfilename == "" ? 
"standard input" : disambig_rxfilename); - if (remove_from_output) Invert(fst); - RemoveSomeInputSymbols(disambig_in, fst); - if (remove_from_output) Invert(fst); - + if (apply_to_output) Invert(fst); + if (remove_arcs) { + RemoveArcsWithSomeInputSymbols(disambig_in, fst); + } else if (penalty != -std::numeric_limits::infinity()) { + PenalizeArcsWithSomeInputSymbols(disambig_in, penalty, fst); + } else { + RemoveSomeInputSymbols(disambig_in, fst); + } + if (apply_to_output) Invert(fst); + WriteFstKaldi(*fst, fst_wxfilename); delete fst; - return 0; + return 0; } catch(const std::exception &e) { std::cerr << e.what(); return -1; } } +/* some test examples: + + ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols "echo 3; echo 4|" | fstprint + # should produce: + # 0 0 1 1 + # 0 0 0 2 + # 0 + + ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols --apply-to-output=true "echo 2; echo 3|" | fstprint + # should produce: + # 0 0 1 1 + # 0 0 3 0 + # 0 + + + ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols --remove-arcs=true "echo 3; echo 4|" | fstprint + # should produce: + # 0 0 1 1 + # 0 + + ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols --penalty=2 "echo 3; echo 4; echo 5|" | fstprint +# should produce: + # 0 0 1 1 + # 0 0 3 2 2 + # 0 + +*/ diff --git a/src/fstbin/fsts-to-transcripts.cc b/src/fstbin/fsts-to-transcripts.cc index 3190a8e2a86..7c301e10390 100644 --- a/src/fstbin/fsts-to-transcripts.cc +++ b/src/fstbin/fsts-to-transcripts.cc @@ -33,19 +33,19 @@ int main(int argc, char *argv[]) { const char *usage = "Reads a table of FSTs; for each element, finds the best path and prints out the\n" - "output-symbol sequence (if --output-side=true), or input-symbol sequence" + "output-symbol sequence (if --output-side=true), or input-symbol sequence " "otherwise.\n" "\n" - "Usage: fsts-to-transcripts [options] fsts-rspecifier transcriptions-wspecifier\n" + "Usage: fsts-to-transcripts [options] \n" " e.g.: fsts-to-transcripts ark:train.fsts ark,t:train.text\n"; - + ParseOptions po(usage); bool output_side = true; - po.Register("output-side", &output_side, "If true, extract the symbols on the output\n" - "side of the FSTs, else the input side."); - + po.Register("output-side", &output_side, "If true, extract the symbols on " + "the output side of the FSTs, else the input side."); + po.Read(argc, argv); if (po.NumArgs() < 2 || po.NumArgs() > 3) { @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) { for (; !fst_reader.Done(); fst_reader.Next()) { std::string key = fst_reader.Key(); const VectorFst &fst = fst_reader.Value(); - + VectorFst shortest_path; ShortestPath(fst, &shortest_path); // the OpenFst algorithm ShortestPath. @@ -75,7 +75,7 @@ int main(int argc, char *argv[]) { n_err++; continue; } - + std::vector transcript; bool ans; if (output_side) ans = fst::GetLinearSymbolSequence( @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) { } KALDI_LOG << "Converted " << n_done << " FSTs, " << n_err << " with errors"; - return (n_done != 0 ? 0 : 1); + return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; diff --git a/src/fstext/Makefile b/src/fstext/Makefile index 2b8c54c653f..3c419182684 100644 --- a/src/fstext/Makefile +++ b/src/fstext/Makefile @@ -25,6 +25,6 @@ LIBNAME = kaldi-fstext # tree and matrix archives needed for test-context-fst # matrix archive needed for push-special. 
ADDLIBS = ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \ - ../util/kaldi-util.a ../base/kaldi-base.a + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fstext/context-fst-inl.h b/src/fstext/context-fst-inl.h index 889f138e0fa..5127e7ae584 100644 --- a/src/fstext/context-fst-inl.h +++ b/src/fstext/context-fst-inl.h @@ -88,8 +88,8 @@ template ContextFstImpl::ContextFstImpl(const ContextFstImpl &other): phone_syms_(other.phone_syms_), disambig_syms_(other.disambig_syms_) { - std::cerr << "ContextFst copying not yet supported [not hard, but would have to test.]"; - exit(1); + KALDI_ERR << "ContextFst copying not yet supported " + << "[not hard, but would have to test.]"; } @@ -149,7 +149,7 @@ typename ContextFstImpl::Weight ContextFstImpl::Final( assert(static_cast(s) < state_seqs_.size()); // make sure state exists already. if (!this->HasFinal(s)) { // Work out final-state weight. const vector &seq = state_seqs_[s]; - + bool final_ok; assert(static_cast(seq.size()) == N_-1); @@ -198,8 +198,8 @@ size_t ContextFstImpl::NumArcs(StateId s) { } else { // For normal states, in general there is potentially an arc for each phone and an arc // for each disambiguation symbol, plus one for the subsequential symbol. - return phone_syms_.size() + disambig_syms_.size() + 1; - } + return phone_syms_.size() + disambig_syms_.size() + 1; + } } template @@ -310,9 +310,9 @@ bool ContextFstImpl::CreateArc(StateId s, // the output arcs, just 0. return CreatePhoneOrEpsArc(s, nextstate, olabel, phoneseq, oarc); } else { - std::cerr << "ContextFst: CreateArc, invalid olabel supplied [confusion about phone list or disambig symbols?]: "<<(olabel); - exit(1); - } + KALDI_ERR << "ContextFst: CreateArc, invalid olabel supplied [confusion " + << "about phone list or disambig symbols?]: " << olabel; + } return false; // won't get here. suppress compiler error. } @@ -400,7 +400,6 @@ bool ContextMatcher::Find(typename Arc::Label match_label) { template void AddSubsequentialLoop(typename Arc::Label subseq_symbol, MutableFst *fst) { - typedef typename Arc::Label Label; typedef typename Arc::StateId StateId; typedef typename Arc::Weight Weight; @@ -463,30 +462,26 @@ SymbolTable *CreateILabelInfoSymbolTable(const vector > &info, assert(s == 0); for (size_t i = 1; i < info.size(); i++) { if (info[i].size() == 0) { - std::cerr << "CreateILabelInfoSymbolTable: invalid ilabel-info"; - exit(1); + KALDI_ERR << "Invalid ilabel-info"; } if (info[i].size() == 1 && info[i][0] <= 0) { if (info[i][0] == 0) { // special symbol at start that we want to call #-1. 
s = ans->AddSymbol(initial_disambig); if (s != i) { - std::cerr << "Disambig symbol " << initial_disambig - << " already in vocab\n"; - exit(1); + KALDI_ERR << "Disambig symbol " << initial_disambig + << " already in vocab"; } } else { std::string disambig_sym = phones_symtab.Find(-info[i][0]); if (disambig_sym == "") { - std::cerr << "CreateILabelInfoSymbolTable: disambig symbol " - << -info[i][0] << " not in phone symbol-table."; - exit(1); + KALDI_ERR << "Disambig symbol " << -info[i][0] + << " not in phone symbol-table"; } s = ans->AddSymbol(disambig_sym); if (s != i) { - std::cerr << "Disambig symbol " << disambig_sym - << " already in vocab\n"; - exit(1); + KALDI_ERR << "Disambig symbol " << disambig_sym + << " already in vocab"; } } } else { @@ -495,24 +490,22 @@ SymbolTable *CreateILabelInfoSymbolTable(const vector > &info, for (size_t j = 0; j < info[i].size(); j++) { std::string phonesym = phones_symtab.Find(info[i][j]); if (phonesym == "") { - std::cerr << "CreateILabelInfoSymbolTable: symbol " - << info[i][j] << " not in phone symbol-table."; - exit(1); + KALDI_ERR << "Symbol " << info[i][j] + << " not in phone symbol-table"; } if (j != 0) newsym += separator; newsym += phonesym; } int64 s = ans->AddSymbol(newsym); if (s != static_cast(i)) { - std::cerr << "CreateILabelInfoSymbolTable: some problem with duplicate symbols."; - exit(1); + KALDI_ERR << "Some problem with duplicate symbols"; } } } return ans; } -inline void ComposeContext(vector &disambig_syms_in, +inline void ComposeContext(const vector &disambig_syms_in, int N, int P, VectorFst *ifst, VectorFst *ofst, @@ -532,7 +525,7 @@ inline void ComposeContext(vector &disambig_syms_in, if (!std::binary_search(disambig_syms.begin(), disambig_syms.end(), all_syms[i])) phones.push_back(all_syms[i]); - + // Get subsequential symbol that does not clash with // any disambiguation symbol or symbol in the FST. int32 subseq_sym = 1; @@ -540,7 +533,7 @@ inline void ComposeContext(vector &disambig_syms_in, subseq_sym = std::max(subseq_sym, all_syms.back() + 1); if (!disambig_syms.empty()) subseq_sym = std::max(subseq_sym, disambig_syms.back() + 1); - + // if P == N-1, it's left-context, and no subsequential symbol needed. if (P != N-1) AddSubsequentialLoop(subseq_sym, ifst); @@ -551,8 +544,8 @@ inline void ComposeContext(vector &disambig_syms_in, /// -} // end namespace fst +} // namespace fst -#endif +#endif // KALDI_FSTEXT_CONTEXT_FST_INL_H_ diff --git a/src/fstext/context-fst.h b/src/fstext/context-fst.h index c0b62f00135..15cb0ef9fdb 100644 --- a/src/fstext/context-fst.h +++ b/src/fstext/context-fst.h @@ -274,9 +274,9 @@ class ContextFst : public Fst { virtual uint64 Properties(uint64 mask, bool test) const { if (test) { - uint64 known, test = TestProperties(*this, mask, &known); - impl_->SetProperties(test, known); - return test & mask; + uint64 knownprops, testprops = TestProperties(*this, mask, &knownprops); + impl_->SetProperties(knownprops, testprops); + return testprops & mask; } else { return impl_->Properties(mask); } @@ -310,7 +310,7 @@ class ContextFst : public Fst { ContextFstImpl *impl_; // protected so CacheStateIterator // Makes visible to friends. ContextFstImpl *GetImpl() const { return impl_; } - // would be: ImplToFst >::GetImpl(); + // would be: ImplToFst >::GetImpl(); // but need to convert to using the ImplToFst stuff. void operator = (const ContextFstImpl &fst); // disallow @@ -504,7 +504,7 @@ void ComposeContextFst(const ContextFst &ifst1, const Fst &ifs information to ilabels_out. 
"ifst" is mutable because we need to add the subsequential loop. */ -inline void ComposeContext(vector &disambig_syms, +inline void ComposeContext(const vector &disambig_syms, int N, int P, VectorFst *ifst, VectorFst *ofst, @@ -534,4 +534,4 @@ void AddSubsequentialLoop(typename Arc::Label subseq_symbol, #include "context-fst-inl.h" -#endif +#endif // KALDI_FSTEXT_CONTEXT_FST_H_ diff --git a/src/fstext/deterministic-fst-inl.h b/src/fstext/deterministic-fst-inl.h index 1af52ce594c..d9099e47ba3 100644 --- a/src/fstext/deterministic-fst-inl.h +++ b/src/fstext/deterministic-fst-inl.h @@ -1,7 +1,8 @@ // fstext/deterministic-fst-inl.h -// Copyright 2011-2012 Gilles Boulianne Johns Hopkins University (author: Daniel Povey) +// Copyright 2011-2012 Gilles Boulianne // 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder) +// 2012-2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -311,6 +312,197 @@ bool LmExampleDeterministicOnDemandFst::GetArc( return true; } + +template +void ComposeDeterministicOnDemand(const Fst &fst1, + DeterministicOnDemandFst *fst2, + MutableFst *fst_composed) { + typedef typename Arc::Weight Weight; + typedef typename Arc::StateId StateId; + typedef std::pair StatePair; + typedef unordered_map > MapType; + typedef typename MapType::iterator IterType; + + fst_composed->DeleteStates(); + + MapType state_map; + std::queue state_queue; + + // Set start state in fst_composed. + StateId s1 = fst1.Start(), + s2 = fst2->Start(), + start_state = fst_composed->AddState(); + StatePair start_pair(s1, s2); + state_queue.push(start_pair); + fst_composed->SetStart(start_state); + // A mapping between pairs of states in fst1 and fst2 and the corresponding + // state in fst_composed. + std::pair start_map(start_pair, start_state); + std::pair result = state_map.insert(start_map); + KALDI_ASSERT(result.second == true); + + while (!state_queue.empty()) { + StatePair q = state_queue.front(); + StateId q1 = q.first, + q2 = q.second; + state_queue.pop(); + // If the product of the final weights of the two fsts is non-zero then + // we can set a final-prob in fst_composed + Weight final_weight = Times(fst1.Final(q1), fst2->Final(q2)); + if (final_weight != Weight::Zero()) { + KALDI_ASSERT(state_map.find(q) != state_map.end()); + fst_composed->SetFinal(state_map[q], final_weight); + } + + // for each pair of edges from fst1 and fst2 at q1 and q2. + for (ArcIterator > aiter(fst1, q1); !aiter.Done(); aiter.Next()) { + const Arc &arc1 = aiter.Value(); + Arc arc2; + StatePair next_pair; + StateId next_state1 = arc1.nextstate, + next_state2, + next_state; + // If there is an epsilon on the arc of fst1 we transition to the next + // state but keep fst2 at the current state. + if (arc1.olabel == 0) { + next_state2 = q2; + } else { + bool match = fst2->GetArc(q2, arc1.olabel, &arc2); + if (!match) // There is no matching arc -> nothing to do. + continue; + next_state2 = arc2.nextstate; + } + next_pair = StatePair(next_state1, next_state2); + IterType sitr = state_map.find(next_pair); + // If sitr == state_map.end() then the state isn't in fst_composed yet. + if (sitr == state_map.end()) { + next_state = fst_composed->AddState(); + std::pair new_state( + next_pair, next_state); + std::pair result = state_map.insert(new_state); + // Since we already checked if state_map contained new_state, + // it should always be added if we reach here. 
+ KALDI_ASSERT(result.second == true); + state_queue.push(next_pair); + // If sitr != state_map.end() then the next state is already in + // the state_map. + } else { + next_state = sitr->second; + } + if (arc1.olabel == 0) { + fst_composed->AddArc(state_map[q], Arc(arc1.ilabel, 0, arc1.weight, + next_state)); + } else { + fst_composed->AddArc(state_map[q], Arc(arc1.ilabel, arc2.olabel, + Times(arc1.weight, arc2.weight), next_state)); + } + } + } +} + + +// we are doing *fst_composed = Compose(Inverse(*left), right). +template +void ComposeDeterministicOnDemandInverse(const Fst &right, + DeterministicOnDemandFst *left, + MutableFst *fst_composed) { + typedef typename Arc::Weight Weight; + typedef typename Arc::StateId StateId; + typedef std::pair StatePair; + typedef unordered_map > MapType; + typedef typename MapType::iterator IterType; + + fst_composed->DeleteStates(); + + // the queue and map contain pairs (state-in-left, state-in-right) + MapType state_map; + std::queue state_queue; + + // Set start state in fst_composed. + StateId s_left = left->Start(), + s_right = right.Start(), + start_state = fst_composed->AddState(); + StatePair start_pair(s_left, s_right); + state_queue.push(start_pair); + fst_composed->SetStart(start_state); + // A mapping between pairs of states in *left and right, and the corresponding + // state in fst_composed. + std::pair start_map(start_pair, start_state); + std::pair result = state_map.insert(start_map); + KALDI_ASSERT(result.second == true); + + while (!state_queue.empty()) { + StatePair q = state_queue.front(); + StateId q_left = q.first, + q_right = q.second; + state_queue.pop(); + // If the product of the final weights of the two fsts is non-zero then + // we can set a final-prob in fst_composed + Weight final_weight = Times(left->Final(q_left), right.Final(q_right)); + if (final_weight != Weight::Zero()) { + KALDI_ASSERT(state_map.find(q) != state_map.end()); + fst_composed->SetFinal(state_map[q], final_weight); + } + + for (ArcIterator > aiter(right, q_right); !aiter.Done(); aiter.Next()) { + const Arc &arc_right = aiter.Value(); + Arc arc_left; + StatePair next_pair; + StateId next_state_right = arc_right.nextstate, + next_state_left, + next_state; + // If there is an epsilon on the input side of the rigth arc, we + // transition to the next state of the output but keep 'left' at the + // current state. + if (arc_right.ilabel == 0) { + next_state_left = q_left; + } else { + bool match = left->GetArc(q_left, arc_right.ilabel, &arc_left); + if (!match) // There is no matching arc -> nothing to do. + continue; + // the next 'swap' is because we are composing with the inverse of + // *left. Just removing the swap statement wouldn't let us compose + // with non-inverted *left though, because the GetArc function call + // above interprets the second argument as an ilabel not an olabel. + std::swap(arc_left.ilabel, arc_left.olabel); + next_state_left = arc_left.nextstate; + } + next_pair = StatePair(next_state_left, next_state_right); + IterType sitr = state_map.find(next_pair); + // If sitr == state_map.end() then the state isn't in fst_composed yet. + if (sitr == state_map.end()) { + next_state = fst_composed->AddState(); + std::pair new_state( + next_pair, next_state); + std::pair result = state_map.insert(new_state); + // Since we already checked if state_map contained new_state, + // it should always be added if we reach here. 
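Both ComposeDeterministicOnDemand and the inverse variant rely on the same bookkeeping: a map from (left-state, right-state) pairs to output states, plus a queue of pairs still awaiting expansion. The fragment below is a stripped-down, stand-alone sketch of just that pattern, using plain std containers and a made-up transition rule; it is not the Kaldi code itself.

#include <cstdio>
#include <map>
#include <queue>
#include <utility>

int main() {
  typedef std::pair<int, int> StatePair;
  std::map<StatePair, int> state_map;   // pair -> state id in the composed FST
  std::queue<StatePair> q;
  int next_id = 0;

  StatePair start(0, 0);
  state_map[start] = next_id++;
  q.push(start);

  while (!q.empty()) {
    StatePair cur = q.front();
    q.pop();
    if (cur.first >= 3) continue;             // toy stopping condition
    StatePair next(cur.first + 1, cur.second + 1);  // toy "matched arc"
    if (state_map.find(next) == state_map.end()) {  // first visit: allocate, queue
      state_map[next] = next_id++;
      q.push(next);
    }
    std::printf("arc: %d -> %d\n", state_map[cur], state_map[next]);
  }
  return 0;
}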
+ KALDI_ASSERT(result.second == true); + state_queue.push(next_pair); + // If sitr != state_map.end() then the next state is already in + // the state_map. + } else { + next_state = sitr->second; + } + if (arc_right.ilabel == 0) { + // we didn't get an actual arc from the left FST. + fst_composed->AddArc(state_map[q], Arc(0, arc_right.olabel, + arc_right.weight, + next_state)); + } else { + fst_composed->AddArc(state_map[q], + Arc(arc_left.ilabel, arc_right.olabel, + Times(arc_left.weight, arc_right.weight), + next_state)); + } + } + } +} + + + } // end namespace fst diff --git a/src/fstext/deterministic-fst-test.cc b/src/fstext/deterministic-fst-test.cc index 90b74e27e9c..a041291e427 100644 --- a/src/fstext/deterministic-fst-test.cc +++ b/src/fstext/deterministic-fst-test.cc @@ -109,7 +109,7 @@ StdVectorFst* CreateResultFst() { } void DeleteTestFst(StdVectorFst *fst) { - if (fst) delete fst; + delete fst; } // Follow paths from an input fst representing a string diff --git a/src/fstext/deterministic-fst.h b/src/fstext/deterministic-fst.h index ecb3f9e969b..65ec4685170 100644 --- a/src/fstext/deterministic-fst.h +++ b/src/fstext/deterministic-fst.h @@ -1,7 +1,8 @@ // fstext/deterministic-fst.h -// Copyright 2011-2012 Gilles Boulianne Johns Hopkins University (author: Daniel Povey) +// Copyright 2011-2012 Gilles Boulianne // 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder) +// 2012-2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -63,15 +64,12 @@ namespace fst { /// @{ -/// class DeterministicOnDemandFst is an "FST-like" base-class. -/// It does not actually inherit from any Fst class because its -/// interface is not exactly the same (it doesn't have the -/// GetArc function). -/// It assumes that the FST can have only one arc for any -/// given input symbol, which makes the GetArc function below -/// possible. -/// Note: we don't use "const" in this interface, because -/// it creates problems when we do things like caching, +/// class DeterministicOnDemandFst is an "FST-like" base-class. It does not +/// actually inherit from any Fst class because its interface is not exactly the +/// same (it doesn't have the GetArc function). It assumes that the FST can +/// have only one arc for any given input symbol, which makes the GetArc +/// function below possible. Note: we don't use "const" in this interface, +/// because it creates problems when we do things like caching. template class DeterministicOnDemandFst { public: @@ -253,6 +251,37 @@ class LmExampleDeterministicOnDemandFst: public DeterministicOnDemandFst { }; +// Compose an FST (which may be a lattice) with a DeterministicOnDemandFst and +// store the result in fst_composed. This is mainly used for expanding lattice +// n-gram histories, where fst1 is a lattice and fst2 is an UnweightedNgramFst. +// This does not call Connect. +template +void ComposeDeterministicOnDemand(const Fst &fst1, + DeterministicOnDemandFst *fst2, + MutableFst *fst_composed); + +/** + This function does + '*fst_composed = Compose(Inverse(*fst2), fst1)' + Note that the arguments are reversed; this is unfortunate but it's + because the fst2 argument needs to be non-const and non-const arguments + must follow const ones. + This is the counterpart to ComposeDeterministicOnDemand, used for + the case where the DeterministicOnDemandFst is on the left. The + reason why we need to make the left-hand argument to compose the + inverse of 'fst2' (i.e. 
with the input and output symbols swapped),
+   is that the DeterministicOnDemandFst interface only supports lookup
+   by ilabel (see its function GetArc).
+   This does not call Connect.
+*/
+template<class Arc>
+void ComposeDeterministicOnDemandInverse(const Fst<Arc> &fst1,
+                                         DeterministicOnDemandFst<Arc> *fst2,
+                                         MutableFst<Arc> *fst_composed);
+
+
+
+
 /// @}
 } // namespace fst
diff --git a/src/fstext/determinize-lattice-inl.h b/src/fstext/determinize-lattice-inl.h
index b41deb980ee..9aff3e774a4 100644
--- a/src/fstext/determinize-lattice-inl.h
+++ b/src/fstext/determinize-lattice-inl.h
@@ -48,15 +48,15 @@ template<class IntType> class LatticeStringRepository {
   // Note: all Entry* pointers returned in function calls are
   // owned by the repository itself, not by the caller!
 
-  // Interface guarantees empty string is NULL.
-  inline const Entry *EmptyString() { return NULL; }
+  // Interface guarantees empty string is NULL.
+  inline const Entry *EmptyString() { return NULL; }
 
   // Returns string of "parent" with i appended.  Pointer
   // owned by repository
   const Entry *Successor(const Entry *parent, IntType i) {
     new_entry_->parent = parent;
     new_entry_->i = i;
-
+
     std::pair<typename SetType::iterator, bool> pr = set_.insert(new_entry_);
     if (pr.second) { // Was successfully inserted (was not there).  We need to
       // replace the element we inserted, which resides on the
@@ -124,7 +124,7 @@ template<class IntType> class LatticeStringRepository {
       ans = Successor(ans, a_vec[i]);
     return ans;
   }
-
+
   // Returns true if a is a prefix of b.  If a is prefix of b,
@@ -145,7 +145,7 @@ template<class IntType> class LatticeStringRepository {
     }
     return ans;
   }
-
+
   void ConvertToVector(const Entry *entry, vector<IntType> *out) const {
     size_t length = Size(entry);
     out->resize(length);
@@ -165,9 +165,9 @@ template<class IntType> class LatticeStringRepository {
       e = Successor(e, vec[i]);
     return e;
   }
-
+
   LatticeStringRepository() { new_entry_ = new Entry; }
-
+
   void Destroy() {
     for (typename SetType::iterator iter = set_.begin();
          iter != set_.end();
@@ -199,13 +199,13 @@ template<class IntType> class LatticeStringRepository {
     }
     set_.swap(tmp_set);
   }
-
+
   ~LatticeStringRepository() { Destroy(); }
   int32 MemSize() const {
     return set_.size() * sizeof(Entry) * 2; // this is a lower bound
     // on the size this structure might take.
   }
- private:
+ private:
   class EntryKey { // Hash function object.
   public:
    inline size_t operator()(const Entry *entry) const {
@@ -234,7 +234,7 @@ template<class IntType> class LatticeStringRepository {
       }
     }
   }
-
+
   DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository);
   Entry *new_entry_; // We always have a pre-allocated Entry ready to use,
                      // to avoid unnecessary news and deletes.
@@ -263,8 +263,8 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   typedef CompactLatticeWeightTpl<Weight, IntType> CompactWeight;
   typedef ArcTpl<CompactWeight> CompactArc; // arc in compact, acceptor form of lattice
-  typedef ArcTpl<Weight> Arc; // arc in non-compact version of lattice
-
+  typedef ArcTpl<Weight> Arc; // arc in non-compact version of lattice
+
   // Output to standard FST with CompactWeightTpl<Weight> as its weight type (the
   // weight stores the original output-symbol strings).  If destroy == true,
@@ -427,11 +427,11 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     { vector<OutputStateId> tmp; tmp.swap(queue_); }
     { vector<pair<Label, Element> > tmp; tmp.swap(all_elems_tmp_); }
   }
-
+
   ~LatticeDeterminizer() {
     FreeMostMemory(); // rest is deleted by destructors.
   }
-  void RebuildRepository() { // rebuild the string repository,
+  void RebuildRepository() { // rebuild the string repository,
     // freeing stuff we don't need.. we call this when memory usage
     // passes a supplied threshold.  We need to accumulate all the
     // strings we need the repository to "remember", then tell it
@@ -464,7 +464,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
                          needed_strings.end()); // uniq the strings.
     repository_.Rebuild(needed_strings);
   }
-
+
   bool CheckMemoryUsage() {
     int32 repo_size = repository_.MemSize(),
         arcs_size = num_arcs_ * sizeof(TempArc),
@@ -479,7 +479,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       KALDI_VLOG(2) << "Rebuilt repository in determinize-lattice: repository shrank from "
                     << repo_size << " to " << new_repo_size << " bytes (approximately)";
-
+
       if (new_total_size > static_cast<int32>(opts_.max_mem * 0.8)) {
         // Rebuilding didn't help enough-- we need a margin to stop
         // having to rebuild too often.
@@ -492,7 +492,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     }
     return true;
   }
-
+
   // Returns true on success.  Can fail for out-of-memory
   // or max-states related reasons.
   bool Determinize(bool *debug_ptr) {
@@ -521,12 +521,12 @@ template<class Weight, class IntType> class LatticeDeterminizer {
                  << repo_size << "," << arcs_size << "," << elems_size << ")";
       return (determinized_ = false);
     } catch (std::runtime_error) {
-      std::cerr << "Caught exception doing lattice determinization\n";
+      KALDI_WARN << "Caught exception doing lattice determinization";
       return (determinized_ = false);
-    }
+    }
   }
  private:
-
+
   typedef typename Arc::Label Label;
   typedef typename Arc::StateId StateId;  // use this when we don't know if it's input or output.
   typedef typename Arc::StateId InputStateId;  // state in the input FST.
@@ -547,6 +547,10 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       return (state != other.state || string != other.string ||
               weight != other.weight);
     }
+    // This operator is only intended to support sorting in EpsilonClosure()
+    bool operator < (const Element &other) const {
+      return state < other.state;
+    }
   };
 
   // Arcs in the format we temporarily create in this class (a representation, essentially of
@@ -635,7 +639,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   // these types are the same anyway].
   typedef unordered_map<const vector<Element>*, Element, SubsetKey, SubsetEqual> InitialSubsetHash;
-
+
   // converts the representation of the subset from canonical (all states) to
   // minimal (only states with output symbols on arcs leaving them, and final
@@ -653,7 +657,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     }
     subset->resize(cur_out - subset->begin());
   }
-
+
   // Takes a minimal, normalized subset, and converts it to an OutputStateId.
   // Involves a hash lookup, and possibly adding a new OutputStateId.
   // If it creates a new OutputStateId, it adds it to the queue.
@@ -672,7 +676,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     return ans;
   }
-
+
   // Given a normalized initial subset of elements (i.e. before epsilon closure),
   // compute the corresponding output-state.
   OutputStateId InitialToStateId(const vector<Element> &subset_in,
@@ -685,7 +689,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       *remaining_weight = elem.weight;
       *common_prefix = elem.string;
       if (elem.weight == Weight::Zero())
-        std::cerr << "Zero weight!\n"; // TEMP
+        KALDI_WARN << "Zero weight!"; // TEMP
       return elem.state;
     }
     // else no matching subset-- have to work it out.
@@ -698,17 +702,17 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     ConvertToMinimal(&subset); // remove all but emitting and final states.
 
     Element elem; // will be used to store remaining weight and string, and
-                  // OutputStateId, in initial_hash_;
+                  // OutputStateId, in initial_hash_;
 
     NormalizeSubset(&subset, &elem.weight, &elem.string); // normalize subset; put
     // common string and weight in "elem".  The subset is now a minimal,
     // normalized subset.
-
+
     OutputStateId ans = MinimalToStateId(subset);
     *remaining_weight = elem.weight;
     *common_prefix = elem.string;
     if (elem.weight == Weight::Zero())
-      std::cerr << "Zero weight!\n"; // TEMP
-
+      KALDI_WARN << "Zero weight!"; // TEMP
+
     // Before returning "ans", add the initial subset to the hash,
     // so that we can bypass the epsilon-closure etc., next time
     // we process the same initial subset.
@@ -748,8 +752,8 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     assert(0); // because we checked if a_str == b_str above, shouldn't reach here
     return 0;
   }
-
-
+
+
   // This function computes epsilon closure of subset of states by following epsilon links.
   // Called by InitialToStateId and Initialize.
   // Has no side effects except on the string repository.  The "output_subset" is not
@@ -759,37 +763,26 @@
     // at input, subset must have only one example of each StateId.  [will still
     // be so at output].  This function follows input-epsilons, and augments the
     // subset accordingly.
-
+
+    std::deque<Element> queue;
     unordered_map<InputStateId, Element> cur_subset;
-    typedef typename unordered_map<InputStateId, Element>::iterator MapIter;
+    typedef typename unordered_map<InputStateId, Element>::iterator MapIter;
+    typedef typename vector<Element>::const_iterator VecIter;
-    {
-      MapIter iter = cur_subset.end();
-      for (size_t i = 0;i < subset->size();i++) {
-        std::pair<InputStateId, Element> pr((*subset)[i].state, (*subset)[i]);
-#if __GNUC__ == 4 && __GNUC_MINOR__ == 0
-        iter = cur_subset.insert(iter, pr).first;
-#else
-        iter = cur_subset.insert(iter, pr);
-#endif
-        // By providing iterator where we inserted last one, we make insertion more efficient since
-        // input subset was already in sorted order.
-      }
+    for (VecIter iter = subset->begin(); iter != subset->end(); ++iter) {
+      queue.push_back(*iter);
+      cur_subset[iter->state] = *iter;
     }
-    // find whether input fst is known to be sorted on input label.
-    bool sorted = ((ifst_->Properties(kILabelSorted, false) & kILabelSorted) != 0);
-    std::deque<Element> queue;
-    for (typename vector<Element>::const_iterator iter = subset->begin();
-         iter != subset->end();
-         ++iter) queue.push_back(*iter);
+
+    // find whether input fst is known to be sorted on input label.
+    bool sorted = ((ifst_->Properties(kILabelSorted, false) & kILabelSorted) != 0);
     bool replaced_elems = false; // relates to an optimization, see below.
     int counter = 0; // stops infinite loops here for non-lattice-determinizable input;
     // useful in testing.
     while (queue.size() != 0) {
       Element elem = queue.front();
       queue.pop_front();
-
+
       // The next if-statement is a kind of optimization.  It's to prevent us
       // unnecessarily repeating the processing of a state.  "cur_subset" always
       // contains only one Element with a particular state.  The issue is that
@@ -801,8 +794,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
         continue;
       if (opts_.max_loop > 0 && counter++ > opts_.max_loop) {
         KALDI_ERR << "Lattice determinization aborted since looped more than "
-                  << opts_.max_loop << " times during epsilon closure.\n";
-        throw std::runtime_error("looped more than max-arcs times in lattice determinization");
+                  << opts_.max_loop << " times during epsilon closure";
       }
       for (ArcIterator<Fst<Arc> > aiter(*ifst_, elem.state); !aiter.Done(); aiter.Next()) {
         const Arc &arc = aiter.Value();
@@ -818,9 +810,8 @@ template<class Weight, class IntType> class LatticeDeterminizer {
           next_elem.string = elem.string;
         else
           next_elem.string = repository_.Successor(elem.string, arc.olabel);
-
-        typename unordered_map<InputStateId, Element>::iterator
-            iter = cur_subset.find(next_elem.state);
+
+        MapIter iter = cur_subset.find(next_elem.state);
         if (iter == cur_subset.end()) {
           // was no such StateId: insert and add to queue.
           cur_subset[next_elem.state] = next_elem;
@@ -843,12 +834,13 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       }
     }
 
-    { // copy cur_subset to subset.
-      // sorted order is automatic.
+    { // copy cur_subset to subset.
       subset->clear();
       subset->reserve(cur_subset.size());
       MapIter iter = cur_subset.begin(), end = cur_subset.end();
       for (; iter != end; ++iter) subset->push_back(iter->second);
+      // sort by state ID, because the subset hash function is order-dependent(see SubsetKey)
+      std::sort(subset->begin(), subset->end());
     }
   }
@@ -889,7 +881,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       temp_arc.string = final_string;
       temp_arc.weight = final_weight;
       output_arcs_[output_state].push_back(temp_arc);
-      num_arcs_++;
+      num_arcs_++;
     }
   }
@@ -900,7 +892,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
                        Weight *tot_weight, StringId *common_str) {
     if(elems->empty()) { // just set common_str, tot_weight
-      std::cerr << "[empty subset]\n"; // TEMP
+      KALDI_WARN << "[empty subset]"; // TEMP
       // to defaults and return...
       *common_str = repository_.EmptyString();
       *tot_weight = Weight::Zero();
@@ -910,14 +902,14 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     vector<IntType> common_prefix;
     repository_.ConvertToVector((*elems)[0].string, &common_prefix);
     Weight weight = (*elems)[0].weight;
-    for(size_t i = 1; i < size; i++) {
+    for (size_t i = 1; i < size; i++) {
       weight = Plus(weight, (*elems)[i].weight);
       repository_.ReduceToCommonPrefix((*elems)[i].string, &common_prefix);
     }
     assert(weight != Weight::Zero()); // we made sure to ignore arcs with zero
     // weights on them, so we shouldn't have zero here.
     size_t prefix_len = common_prefix.size();
-    for(size_t i = 0; i < size; i++) {
+    for (size_t i = 0; i < size; i++) {
       (*elems)[i].weight = Divide((*elems)[i].weight, weight, DIVIDE_LEFT);
       (*elems)[i].string =
           repository_.RemovePrefix((*elems)[i].string, prefix_len);
@@ -931,11 +923,11 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   // (weight, string) pair in the semiring).
   void MakeSubsetUnique(vector<Element> *subset) {
     typedef typename vector<Element>::iterator IterType;
-
+
     // This assert is designed to fail (usually) if the subset is not sorted on
     // state.
     assert(subset->size() < 2 || (*subset)[0].state <= (*subset)[1].state);
-
+
     IterType cur_in = subset->begin(), cur_out = cur_in, end = subset->end();
     size_t num_out = 0;
     // Merge elements with same state-id
@@ -958,7 +950,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     }
     subset->resize(num_out);
   }
-
+
   // ProcessTransition is called from "ProcessTransitions".  Broken out for
   // clarity.  Processes a transition from state "state".  The set of Elements
   // represents a set of next-states with associated weights and strings, each
@@ -969,7 +961,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   // semiring).
   void ProcessTransition(OutputStateId state, Label ilabel, vector<Element> *subset) {
     MakeSubsetUnique(subset); // remove duplicates with the same state.
-
+
     StringId common_str;
     Weight tot_weight;
     NormalizeSubset(subset, &tot_weight, &common_str);
@@ -978,13 +970,13 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     {
       Weight next_tot_weight;
       StringId next_common_str;
-      nextstate = InitialToStateId(*subset,
+      nextstate = InitialToStateId(*subset,
                                    &next_tot_weight,
                                    &next_common_str);
       common_str = repository_.Concatenate(common_str, next_common_str);
       tot_weight = Times(tot_weight, next_tot_weight);
     }
-
+
     // Now add an arc to the next state (would have been created if necessary by
     // InitialToStateId).
     TempArc temp_arc;
@@ -998,7 +990,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
 
   // "less than" operator for pair<Label, Element>.  Used in ProcessTransitions.
-  // Lexicographical order, which only compares the state when ordering the
+  // Lexicographical order, which only compares the state when ordering the
   // "Element" member of the pair.
 
   class PairComparator {
@@ -1022,7 +1014,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   // with the same ilabel.
   // Side effects on repository, and (via ProcessTransition) on Q_, hash_,
   // and output_arcs_.
-
+
   void ProcessTransitions(OutputStateId output_state) {
     const vector<Element> &minimal_subset = *(output_states_[output_state]);
     // it's possible that minimal_subset could be empty if there are
@@ -1046,7 +1038,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
         next_elem.weight = Times(elem.weight, arc.weight);
         if (arc.olabel == 0) // output epsilon
           next_elem.string = elem.string;
-        else
+        else
           next_elem.string = repository_.Successor(elem.string, arc.olabel);
         all_elems.push_back(this_pr);
       }
@@ -1083,29 +1075,28 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     ProcessFinal(output_state);
     ProcessTransitions(output_state);
   }
-
+
   void Debug() { // this function called if you send a signal
     // SIGUSR1 to the process (and it's caught by the handler in
     // fstdeterminizestar). It prints out some traceback
     // info and exits.
-    std::cerr << "Debug function called (probably SIGUSR1 caught).\n";
+    KALDI_WARN << "Debug function called (probably SIGUSR1 caught)";
     // free up memory from the hash as we need a little memory
     { MinimalSubsetHash hash_tmp; hash_tmp.swap(minimal_hash_); }
     if (output_arcs_.size() <= 2) {
-      std::cerr << "Nothing to trace back";
-      exit(1);
+      KALDI_ERR << "Nothing to trace back";
     }
-    size_t max_state = output_arcs_.size() - 2; // don't take the last
+    size_t max_state = output_arcs_.size() - 2; // Don't take the last
     // one as we might be halfway into constructing it.
 
     vector<OutputStateId> predecessor(max_state+1, kNoStateId);
     for (size_t i = 0; i < max_state; i++) {
       for (size_t j = 0; j < output_arcs_[i].size(); j++) {
         OutputStateId nextstate = output_arcs_[i][j].nextstate;
-        // always find an earlier-numbered prececessor; this
+        // Always find an earlier-numbered predecessor; this
        // is always possible because of the way the algorithm
        // works.
        if (nextstate <= max_state && nextstate > i)
@@ -1113,8 +1104,8 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       }
     }
     vector<pair<Label, StringId> > traceback;
-    // traceback is a pair of (ilabel, olabel-seq).
-    OutputStateId cur_state = max_state; // a recently constructed state.
+    // 'traceback' is a pair of (ilabel, olabel-seq).
+    OutputStateId cur_state = max_state; // A recently constructed state.
 
     while (cur_state != 0 && cur_state != kNoStateId) {
       OutputStateId last_state = predecessor[cur_state];
@@ -1128,23 +1119,25 @@ template<class Weight, class IntType> class LatticeDeterminizer {
           break;
         }
       }
-      assert(i != output_arcs_[last_state].size()); // or fell off loop.
+      KALDI_ASSERT(i != output_arcs_[last_state].size()); // Or fell off loop.
       cur_state = last_state;
     }
-    if (cur_state == kNoStateId)
-      std::cerr << "Traceback did not reach start state (possibly debug-code error)";
+    if (cur_state == kNoStateId)
+      KALDI_WARN << "Traceback did not reach start state "
+                 << "(possibly debug-code error)";
 
-    std::cerr << "Traceback below (or on standard error) in format ilabel (olabel olabel) ilabel (olabel) ...\n";
+    std::stringstream ss;
+    ss << "Traceback follows in format "
+       << "ilabel (olabel olabel) ilabel (olabel) ... :";
     for (ssize_t i = traceback.size() - 1; i >= 0; i--) {
-      std::cerr << traceback[i].first << ' ' << "( ";
+      ss << ' ' << traceback[i].first << " ( ";
       vector